Skip to content

Commit

Permalink
Merge branch 'master' into fix_generate
Browse files Browse the repository at this point in the history
  • Loading branch information
younesbelkada authored Jul 8, 2022
2 parents 5759518 + dc8d99f commit 5848c7f
Show file tree
Hide file tree
Showing 18 changed files with 559 additions and 31 deletions.
2 changes: 1 addition & 1 deletion evaluation/generation/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def main():
max_cpu_memory=args.max_cpu_memory if args.parallelize else None,
offload_folder=args.offload_folder if args.parallelize else None,
)

print(f"Loaded model in {datetime.datetime.now() - start}")

text = ''
Expand Down
44 changes: 44 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11b.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11b_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-1362%1
#SBATCH --partition=cpu_p1

# Tar one tr11b-1B3-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11b-1B3-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd

44 changes: 44 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11c.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11c_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-239%1
#SBATCH --partition=cpu_p1

# Tar one tr11c-2B5-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11c-2B5-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd

44 changes: 44 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11d.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11d_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-166%1
#SBATCH --partition=cpu_p1

# Tar one tr11d-760M-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 256 -> one out of 8 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11d-760M-ml/.*/global_step[0-9]*[02468]000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd

43 changes: 43 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11e.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#SBATCH --job-name=move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-276%1
#SBATCH --partition=cpu_p1

# Tar one tr11e-350M-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 256 -> one out of 8 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*[02468]000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
44 changes: 44 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11f.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11f_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-155%1
#SBATCH --partition=cpu_p1

# Tar one tr11f-6B3-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11f-6B3-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd

45 changes: 45 additions & 0 deletions jz/model_storage/move_first_150_checkpoints_to_store.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=move_first_checkpoints_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --partition=cpu_p1

# Tar the first $2 checkpoints of experiment $1 (one per array task) and
# store the tarballs under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-149


# you have to also pass --array=0-<desired_number>%1 as an sbatch flag to compress everything, eg sbatch --array=0-149%1 move_first_150_checkpoints_to_store.slurm tr11b-1B3-ml 150

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> first 150 checkpoints for 39B tokens, batch size 256 -> 300
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(ls -v ./"${1}"/checkpoints/main/ | head -"${2}")

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR="./${1}/checkpoints/main/${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}"
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './'
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
2 changes: 1 addition & 1 deletion jz/slurm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ more informative all-in-one myjobs that includes the projected start time for pe

```
alias myjobs='squeue -u `whoami` -o "%.16i %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
alias groupjobs='squeue -u $(getent group six | cut -d: -f4) -o "%.16i %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
alias groupjobs='squeue -u $(getent group six | cut -d: -f4) -o "%.16i %u %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
```


Expand Down
28 changes: 25 additions & 3 deletions tools/fs-watchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,25 @@ def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_thres
alerts.append(response)
alerts.append("")

def analyse_shared_disk(partition_name, alert_bytes_threshold):
    """Alert when the shared disk backing ``partition_name`` runs low on space.

    Parses ``df`` output for the GPFS device that backs the given partition
    ("SCRATCH", "WORK" or "STORE") and appends a human-readable message to
    ``alerts`` when the available bytes fall below ``alert_bytes_threshold``.
    """
    # partition alias -> GPFS device name as it appears in the first `df` column
    disk_for_partition = {
        "SCRATCH": "gpfsssd",
        "WORK": "gpfsdswork",
        "STORE": "gpfsdsstore",
    }
    target_disk = disk_for_partition[partition_name]

    df_lines = run_cmd("df".split()).split("\n")
    headers = df_lines[0].split()
    # first row whose device column matches the disk backing this partition
    row = [line.split() for line in df_lines if line.startswith(target_disk)][0]
    stats = dict(zip(headers, row))

    # by default `df` reports 1024-byte blocks, and `1024 == 2 ** 10`
    available_bytes = int(stats["Available"]) * 2 ** 10
    if available_bytes < alert_bytes_threshold:
        alerts.append(f"Shared {partition_name} has {available_bytes/2**40:.2f}TB left")
        alerts.append("")

# WORK and STORE partitions stats can be accessed much faster through `idrquota`, and it already
# includes the quota info
analyse_partition_idrquota(partition_name="WORK", partition_flag="-w", alert_bytes_threshold=0.85, alert_inodes_threshold=0.85)
Expand All @@ -143,10 +162,13 @@ def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_thres
# SCRATCH - check only bytes w/ a hard quota of 400TB - alert on lower threshold than other
# partitions due to it filling up at a faster rate (dumping huge checkpoints)
analyse_partition_bytes(partition_name="SCRATCH", partition_path="/gpfsssd/scratch/rech/six/", hard_limit_bytes=400*2**40, alert_bytes_threshold=0.75)
# Actually SCRATCH is shared with everyone and we should monitor the output of `df -h | grep gpfsssd`
# Check that there's still 40TB left
analyse_shared_disk("SCRATCH", 100 * 2 ** 40)

# WORKFS - check both bytes and inodes w/ hard quotas of 3TB / 3M
analyse_partition_bytes(partition_name="WORKFS", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_bytes=3*2**40, alert_bytes_threshold=0.85)
analyse_partition_inodes(partition_name="WORKFS", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_inodes=3*10**6, alert_inodes_threshold=0.85)
# WORKSF - check both bytes and inodes w/ hard quotas of 2TB / 3M
analyse_partition_bytes(partition_name="WORKSF", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_bytes=2*2**40, alert_bytes_threshold=0.85)
analyse_partition_inodes(partition_name="WORKSF", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_inodes=3*10**6, alert_inodes_threshold=0.85)

if len(alerts) > 0 :
print(f"[ALERT] JZ filesystem is getting close to being full")
Expand Down
16 changes: 10 additions & 6 deletions tools/slurm-status.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,16 @@ def main():
in_the_system = False
for l in status_lines:
#print(f"l=[{l}]")
jobid, partition, name, state, time, nodes, start_time, notes = l.split(None, 7)
#print("-".join([jobid, partition, name, state, time, nodes, start_time, notes]))
# XXX: add support for regex matching so partial name can be provided
if name == args.job_name:
in_the_system = True
process_job(jobid, partition, name, state, time, nodes, start_time, notes)

# XXX: apparently some jobs can be run w/o name and break the split() call, so match our
# name first and then split
if args.job_name in l:
jobid, partition, name, state, time, nodes, start_time, notes = l.split(None, 7)
#print("-".join([jobid, partition, name, state, time, nodes, start_time, notes]))
# XXX: add support for regex matching so partial name can be provided
if name == args.job_name:
in_the_system = True
process_job(jobid, partition, name, state, time, nodes, start_time, notes)

if not in_the_system:
preamble = get_preamble()
Expand Down
Loading

0 comments on commit 5848c7f

Please sign in to comment.