batchCrusher.slr
#!/bin/bash -l
#SBATCH -N2 --ntasks-per-node=8 --gpus-per-task=1 --cpus-per-task=8 --exclusive
#SBATCH --time=30:00 -J srCos2d
#SBATCH -A AST153_crusher
#SBATCH -p batch
#SBATCH --array 1-1
#-SBATCH -x crusher004 # block sick nodes
# - - - E N D O F SLURM C O M M A N D S
# salloc -A AST153_crusher -p batch -t 30:00 -N 1 --x11 --ntasks-per-node=8 --gpus-per-task=4 --cpus-per-task=8 --exclusive
nprocspn=${SLURM_NTASKS_PER_NODE}
#nprocspn=2 # special case for partial use of full node
#design=hpoa_50eaf423; epochs=3000
design=benchmk_50eaf423; epochs=5
N=${SLURM_NNODES}
G=$(( N * nprocspn ))   # total number of tasks (= GPUs) across all nodes
export MASTER_ADDR=$(hostname)   # address of the rank-0 node for distributed training
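# MASTER_PORT is echoed below but not set here; it is assumed to be provided by the
# environment or inside toolbox/driveOneTrain.sh. A hedged default could be set like:
#   export MASTER_PORT=${MASTER_PORT:-29500}   # 29500 is an illustrative, assumed port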
arrIdx=${SLURM_ARRAY_TASK_ID}
jobId=${SLURM_ARRAY_JOB_ID}_${arrIdx} # must not include '/'
echo S: JID=${jobId} MASTER_ADDR=$MASTER_ADDR MASTER_PORT=${MASTER_PORT} G=$G N=$N nprocspn=$nprocspn
nodeList=$(scontrol show hostname $SLURM_NODELIST)
echo S:node-list $nodeList
# grab optional override variables from the environment, if defined
[[ -z "${SRCOS2D_OTHER_PAR}" ]] && otherParStr=" " || otherParStr=" ${SRCOS2D_OTHER_PAR} "
[[ -z "${SRCOS2D_WRK_SUFIX}" ]] && wrkSufix=$jobId || wrkSufix="${SRCOS2D_WRK_SUFIX}"
env |grep SRCOS2D
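# Example usage (illustrative, not part of this script): the two SRCOS2D_* variables
# above can be exported before submission to override the defaults, e.g.:
#   export SRCOS2D_WRK_SUFIX=myTestRun              # assumed, arbitrary work-dir suffix
#   export SRCOS2D_OTHER_PAR=" --someExtraFlag "    # hypothetical placeholder for extra train_dist.py args
#   sbatch batchCrusher.slr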
echo "on Crusher"
facility=crusher
SIF=/gpfs/alpine/ast153/scratch/balewski/crusher_amd64/rocm4.5-crusher-torch.v4.sif
ENGINE=" singularity exec $SIF "
baseDir=/gpfs/alpine/ast153/scratch/balewski/tmp_NyxHydro4kF/
#baseDir=/gpfs/alpine/stf016/world-shared/vgv/inbox/olcfdev-571/tmp_NyxHydro4kF/
wrkDir=${baseDir}/${wrkSufix}
export MPLCONFIGDIR=$wrkDir   # must be set after wrkDir is defined, otherwise it is empty
echo "S: jobId=$SLURM_JOBID wrkSufix=$wrkSufix wrkDir=$wrkDir"
date
export CMD=" python3 train_dist.py --facility $facility --design $design --epochs $epochs --basePath $wrkDir --expName $jobId $otherParStr "
echo S: CMD=$CMD ENGINE=$ENGINE
codeList=" train_dist.py predict.py toolbox/ batchShifter.slr $design.hpar.yaml "
mkdir -p $wrkDir
cp -rp $codeList $wrkDir
cd $wrkDir
echo S:PWD=`pwd`
# special case: downloading VGG model manually
# see also https://pytorch.org/docs/stable/hub.html
export TORCH_HOME=$baseDir # for storing VGG model
vggPath=${TORCH_HOME}/hub/checkpoints
mkdir -p ${vggPath}
if [ ! -f ${vggPath}/vgg19-dcbb9e9d.pth ] ; then
    echo S: download VGG model to ${vggPath}
    cd ${vggPath}
    pwd
    # it hangs here on a Crusher worker node --> do it by hand from a login node
    wget https://download.pytorch.org/models/vgg19-dcbb9e9d.pth
    ls -l
    cd -
else
    echo S: VGG model found at ${vggPath}
fi
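# Per the note above, if the wget hangs on a worker node, the same download can be done
# by hand from a Crusher login node (using the paths defined above), e.g.:
#   cd ${TORCH_HOME}/hub/checkpoints
#   wget https://download.pytorch.org/models/vgg19-dcbb9e9d.pth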
echo "S:starting jobId=$jobId srgan_cosmo2 " `date` " wrkDir= $wrkDir"
time srun -n $G $ENGINE toolbox/driveOneTrain.sh >& log.train
echo S:done train
date
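# To follow training progress from a login shell while the job runs, one can tail the
# srun log written above (substitute the actual work dir echoed by this script):
#   tail -f <wrkDir>/log.train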