Skip to content

Commit

Permalink
Merge branch 'master' into fix_generate
Browse files Browse the repository at this point in the history
  • Loading branch information
younesbelkada authored Jul 8, 2022
2 parents 5759518 + dc8d99f commit 5848c7f
Show file tree
Hide file tree
Showing 18 changed files with 559 additions and 31 deletions.
2 changes: 1 addition & 1 deletion evaluation/generation/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def main():
max_cpu_memory=args.max_cpu_memory if args.parallelize else None,
offload_folder=args.offload_folder if args.parallelize else None,
)

print(f"Loaded model in {datetime.datetime.now() - start}")

text = ''
Expand Down
44 changes: 44 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11b.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11b_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-1362%1
#SBATCH --partition=cpu_p1

# Tar one tr11b-1B3-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11b-1B3-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd

44 changes: 44 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11c.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11c_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-239%1
#SBATCH --partition=cpu_p1

# Tar one tr11c-2B5-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11c-2B5-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd

44 changes: 44 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11d.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11d_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-166%1
#SBATCH --partition=cpu_p1

# Tar one tr11d-760M-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 256 -> one out of 8 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11d-760M-ml/.*/global_step[0-9]*[02468]000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd

43 changes: 43 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11e.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#SBATCH --job-name=move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-276%1
#SBATCH --partition=cpu_p1

# Tar one tr11e-350M-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 256 -> one out of 8 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*[02468]000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
44 changes: 44 additions & 0 deletions jz/model_storage/move_checkpoints_to_store_tr11f.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11f_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-155%1
#SBATCH --partition=cpu_p1

# Tar one tr11f-6B3-ml checkpoint per array task (one at a time, %1) and
# store the tarball under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(find . -regex '\./tr11f-6B3-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './' that find prepends
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd

45 changes: 45 additions & 0 deletions jz/model_storage/move_first_150_checkpoints_to_store.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=move_first_checkpoints_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --partition=cpu_p1

# Tar the first $2 checkpoints of experiment $1 (one per array task) and
# store the tarballs under $six_ALL_CCFRSTORE/checkpoints, mirroring the
# WORK directory layout.

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-149


# you have to also pass --array=0-<desired_number>%1 as an sbatch flag to compress everything, eg sbatch --array=0-149%1 move_first_150_checkpoints_to_store.slurm tr11b-1B3-ml 150

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> first 150 checkpoints for 39B tokens, batch size 256 -> 300
# -t strips the trailing newline from each array element
readarray -t CHECKPOINTS < <(ls -v ./"${1}"/checkpoints/main/ | head -"${2}")

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR="./${1}/checkpoints/main/${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}"
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")
DIRNAME=${TEMPNAME:2}   # drop the leading './'
BASENAME=$(basename "$CHECKPOINT_TO_TAR")

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p "$CHECKPOINT_TAR_TO_FOLDER"
echo "$CHECKPOINT_TO_TAR"
echo "$CHECKPOINT_TAR_TO"

# cvfj for bz2 compression; won't change much
tar cvf "$CHECKPOINT_TAR_TO" "$CHECKPOINT_TO_TAR"

popd
2 changes: 1 addition & 1 deletion jz/slurm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ more informative all-in-one myjobs that includes the projected start time for pe

```
alias myjobs='squeue -u `whoami` -o "%.16i %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
alias groupjobs='squeue -u $(getent group six | cut -d: -f4) -o "%.16i %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
alias groupjobs='squeue -u $(getent group six | cut -d: -f4) -o "%.16i %u %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
```


Expand Down
28 changes: 25 additions & 3 deletions tools/fs-watchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,25 @@ def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_thres
alerts.append(response)
alerts.append("")

def analyse_shared_disk(partition_name, alert_bytes_threshold):
    """Alert when the shared disk backing ``partition_name`` runs low on space.

    Parses ``df`` output for the GPFS device that backs the given partition
    ("SCRATCH", "WORK" or "STORE") and appends a human-readable message to
    ``alerts`` when the available bytes fall below ``alert_bytes_threshold``.
    """
    # partition alias -> GPFS device name as it appears in the first `df` column
    disk_for_partition = {
        "SCRATCH": "gpfsssd",
        "WORK": "gpfsdswork",
        "STORE": "gpfsdsstore",
    }
    target_disk = disk_for_partition[partition_name]

    df_lines = run_cmd("df".split()).split("\n")
    headers = df_lines[0].split()
    # first row whose device column matches the disk backing this partition
    row = [line.split() for line in df_lines if line.startswith(target_disk)][0]
    stats = dict(zip(headers, row))

    # by default `df` reports 1024-byte blocks, and `1024 == 2 ** 10`
    available_bytes = int(stats["Available"]) * 2 ** 10
    if available_bytes < alert_bytes_threshold:
        alerts.append(f"Shared {partition_name} has {available_bytes/2**40:.2f}TB left")
        alerts.append("")

# WORK and STORE partitions stats can be accessed much faster through `idrquota`, and it already
# includes the quota info
analyse_partition_idrquota(partition_name="WORK", partition_flag="-w", alert_bytes_threshold=0.85, alert_inodes_threshold=0.85)
Expand All @@ -143,10 +162,13 @@ def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_thres
# SCRATCH - check only bytes w/ a hard quota of 400TB - alert on lower threshold than other
# partitions due to it filling up at a faster rate (dumping huge checkpoints)
analyse_partition_bytes(partition_name="SCRATCH", partition_path="/gpfsssd/scratch/rech/six/", hard_limit_bytes=400*2**40, alert_bytes_threshold=0.75)
# Actually SCRATCH is shared with everyone and we should monitor the output of `df -h | grep gpfsssd`
# Check that there's still 40TB left
analyse_shared_disk("SCRATCH", 100 * 2 ** 40)

# WORKFS - check both bytes and inodes w/ hard quotas of 3TB / 3M
analyse_partition_bytes(partition_name="WORKFS", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_bytes=3*2**40, alert_bytes_threshold=0.85)
analyse_partition_inodes(partition_name="WORKFS", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_inodes=3*10**6, alert_inodes_threshold=0.85)
# WORKSF - check both bytes and inodes w/ hard quotas of 2TB / 3M
analyse_partition_bytes(partition_name="WORKSF", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_bytes=2*2**40, alert_bytes_threshold=0.85)
analyse_partition_inodes(partition_name="WORKSF", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_inodes=3*10**6, alert_inodes_threshold=0.85)

if len(alerts) > 0 :
print(f"[ALERT] JZ filesystem is getting close to being full")
Expand Down
16 changes: 10 additions & 6 deletions tools/slurm-status.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,16 @@ def main():
in_the_system = False
for l in status_lines:
#print(f"l=[{l}]")
jobid, partition, name, state, time, nodes, start_time, notes = l.split(None, 7)
#print("-".join([jobid, partition, name, state, time, nodes, start_time, notes]))
# XXX: add support for regex matching so partial name can be provided
if name == args.job_name:
in_the_system = True
process_job(jobid, partition, name, state, time, nodes, start_time, notes)

# XXX: apparently some jobs can be run w/o name and break the split() call, so match our
# name first and then split
if args.job_name in l:
jobid, partition, name, state, time, nodes, start_time, notes = l.split(None, 7)
#print("-".join([jobid, partition, name, state, time, nodes, start_time, notes]))
# XXX: add support for regex matching so partial name can be provided
if name == args.job_name:
in_the_system = True
process_job(jobid, partition, name, state, time, nodes, start_time, notes)

if not in_the_system:
preamble = get_preamble()
Expand Down
Loading

0 comments on commit 5848c7f

Please sign in to comment.