forked from TACC/launcher
-
Notifications
You must be signed in to change notification settings - Fork 1
/
paramrun
executable file
·274 lines (242 loc) · 6.86 KB
/
paramrun
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/bin/bash
# LAUNCHER_DIR is the launcher installation directory; the helpers run later
# in this script (tskserver, pass_env, init_launcher) are all resolved
# relative to it, so nothing can proceed without it.
if [ -z "$LAUNCHER_DIR" ]; then
  echo "ERROR: Launcher base directory LAUNCHER_DIR not set. Exiting." >&2
  # A bare `exit` would return the status of the echo above (0) and report
  # success to the caller; fail explicitly instead.
  exit 1
fi
# Resolve the working directory. Precedence: an explicit LAUNCHER_WORKDIR,
# then the legacy WORKDIR variable, then the directory paramrun was started
# from. The result is exported as LAUNCHER_WORKDIR.
if [[ -z "$LAUNCHER_WORKDIR" ]]; then
  if [[ -n "$WORKDIR" ]]; then
    # Legacy variable name still honoured, with a notice on stderr.
    echo "NOTICE: WORKDIR variable has been depricated. Use LAUNCHER_WORKDIR." >&2
    export LAUNCHER_WORKDIR="$WORKDIR"
  else
    echo "WARNING: LAUNCHER_WORKDIR variable not set. Using current directory." >&2
    export LAUNCHER_WORKDIR="$(pwd)"
  fi
fi
#Step 0: Determine if resource manager integration is enabled
# LAUNCHER_RMI names a plugin file "<name>.rmi" inside LAUNCHER_PLUGIN_DIR
# (default: $LAUNCHER_DIR/plugins). When the file exists it is sourced into
# this shell and LAUNCHER_HOSTFILE is taken from the LAUNCHER_RMI_HOSTFILE
# value it leaves behind. All paths are quoted so a plugin directory
# containing whitespace still works.
if [ -z "$LAUNCHER_PLUGIN_DIR" ]; then
  echo "WARNING: LAUNCHER_PLUGIN_DIR not set. Defaulting to $LAUNCHER_DIR/plugins." >&2
  LAUNCHER_PLUGIN_DIR="$LAUNCHER_DIR/plugins"
fi

if [ -z "$LAUNCHER_RMI" ]; then
  echo "Launcher: Starting without resource manager integration."
elif [ -f "$LAUNCHER_PLUGIN_DIR/$LAUNCHER_RMI.rmi" ]; then
  source "$LAUNCHER_PLUGIN_DIR/$LAUNCHER_RMI.rmi"
  export LAUNCHER_HOSTFILE="$LAUNCHER_RMI_HOSTFILE"
else
  echo "WARNING: Resource Manager Integration plugin file $LAUNCHER_RMI.rmi no found." >&2
  echo " Available options:" >&2
  # Glob directly instead of parsing `ls`; when nothing matches the pattern
  # stays literal, so skip entries that do not exist.
  for rmi in "$LAUNCHER_PLUGIN_DIR"/*.rmi; do
    [ -e "$rmi" ] || continue
    echo " $(basename "$rmi" .rmi)" >&2
  done
  echo " Defaulting to no integration." >&2
fi
#Step 1: Ensure that LAUNCHER_HOSTFILE is set and valid
# Without a hostfile the launcher runs on the local host only: the
# LAUNCHER_LOCALHOST flag is set (shell-local, not exported) and
# LAUNCHER_NHOSTS is exported as 1. A hostfile that is named but missing is
# a fatal error.
if [ -z "$LAUNCHER_HOSTFILE" ]; then
  echo "WARNING: LAUNCHER_HOSTFILE is not set. Defaulting to localhost." >&2
  LAUNCHER_LOCALHOST=1
  export LAUNCHER_NHOSTS=1
elif [ ! -f "$LAUNCHER_HOSTFILE" ]; then
  echo "ERROR: Hostfile ($LAUNCHER_HOSTFILE) not found." >&2
  # Exit non-zero: a bare `exit` would propagate echo's success status.
  exit 1
fi
#Step 2: Ensure that necessary variables are set
# Processes per node: keep a caller-supplied LAUNCHER_PPN; otherwise use
# LAUNCHER_RMI_PPN when it is non-empty, and fall back to 1 with a warning.
if [[ -z "$LAUNCHER_PPN" ]]; then
  if [[ -z "$LAUNCHER_RMI_PPN" ]]; then
    echo "WARNING: LAUNCHER_PPN is not set. Defaulting to 1." >&2
    export LAUNCHER_PPN=1
  else
    export LAUNCHER_PPN="$LAUNCHER_RMI_PPN"
  fi
fi
# Number of hosts: keep a caller-supplied LAUNCHER_NHOSTS; otherwise use
# LAUNCHER_RMI_NHOSTS when it is non-empty, and as a last resort count the
# lines of LAUNCHER_HOSTFILE.
if [ -z "$LAUNCHER_NHOSTS" ]; then
  if [ -n "$LAUNCHER_RMI_NHOSTS" ]; then
    export LAUNCHER_NHOSTS="$LAUNCHER_RMI_NHOSTS"
  else
    echo "WARNING: LAUNCHER_NHOSTS is not set. Calculating from hostfile." >&2
    # Feed the file to wc through a quoted redirect: paths with spaces work,
    # and an empty LAUNCHER_HOSTFILE makes the redirect fail instead of
    # leaving wc reading this script's stdin.
    LAUNCHER_NHOSTS=$(wc -l < "$LAUNCHER_HOSTFILE" | awk '{print $1}')
    export LAUNCHER_NHOSTS
  fi
fi

# Total number of processes = hosts x processes per host. expr prints
# nothing when an operand is missing or not an integer, which the check
# below turns into a fatal error.
LAUNCHER_NPROCS=$(expr "$LAUNCHER_NHOSTS" \* "$LAUNCHER_PPN")
export LAUNCHER_NPROCS
if [ -z "$LAUNCHER_NPROCS" ]; then
  echo "ERROR: LAUNCHER_NPROCS is not set." >&2
  # Exit non-zero so the failure is visible to the caller.
  exit 1
fi
# Backward compatibility with launcher v1/v2: a non-empty CONTROL_FILE is
# copied into LAUNCHER_JOB_FILE (replacing any value already there) and a
# deprecation notice is written to stderr.
[[ -z "$CONTROL_FILE" ]] || {
  echo "NOTICE: CONTROL_FILE variable depricated. Use LAUNCHER_JOB_FILE in the future." >&2
  export LAUNCHER_JOB_FILE="$CONTROL_FILE"
}
#Step 3: Ensure that LAUNCHER_JOB_FILE exists
# The job file must be named and must be a regular file. Its line count is
# exported as LAUNCHER_NJOBS. Both failure cases terminate the script with a
# non-zero status (a bare `exit` would return echo's status, i.e. 0).
if [ -z "$LAUNCHER_JOB_FILE" ]; then
  echo "WARNING: LAUNCHER_JOB_FILE not set." >&2
  exit 1
fi
if [ ! -f "$LAUNCHER_JOB_FILE" ]; then
  echo "ERROR: LAUNCHER_JOB_FILE ($LAUNCHER_JOB_FILE) not found." >&2
  exit 1
fi
# Quoted redirect so that job-file paths containing whitespace are counted
# correctly; awk strips any padding wc puts around the number.
LAUNCHER_NJOBS=$(wc -l < "$LAUNCHER_JOB_FILE" | awk '{print $1}')
export LAUNCHER_NJOBS
#Step 4: Setup Xeon Phi support
# LAUNCHER_USE_PHI is 0 when LAUNCHER_NPHI is empty/unset or exactly "0",
# and 1 (with a message on stdout) for any other value.
case "$LAUNCHER_NPHI" in
  "" | 0)
    LAUNCHER_USE_PHI=0
    ;;
  *)
    echo "Launcher: Setting up Intel Xeon Phi support."
    LAUNCHER_USE_PHI=1
    ;;
esac
#Step 5: Scheduling setup
# An empty or unset LAUNCHER_SCHED defaults to "dynamic" and is exported;
# any other value is left exactly as the caller provided it.
[[ -n "$LAUNCHER_SCHED" ]] || export LAUNCHER_SCHED="dynamic"
# Dynamic scheduling: start $LAUNCHER_DIR/tskserver in the background on this
# host, passing it the job count, this host's name and port 9471. The server
# is disowned and its PID kept in LAUNCHER_DYN_PID for the cleanup at the end
# of the script. The host and port are exported as LAUNCHER_DYN_COUNT and
# LAUNCHER_DYN_COUNT_PORT.
if [ "$LAUNCHER_SCHED" == "dynamic" ]; then
  #Start tskserver
  RUNNING="false"
  RETRY=0
  while [ "$RUNNING" == "false" ]; do
    "$LAUNCHER_DIR/tskserver" "$LAUNCHER_NJOBS" "$HOSTNAME" 9471 2>/dev/null &
    LAUNCHER_DYN_PID=$!
    disown "$LAUNCHER_DYN_PID"
    # Give the server a second; if its process is already gone the start
    # failed. Up to three retries, 10 seconds apart, before giving up.
    sleep 1s
    if ps -p "$LAUNCHER_DYN_PID" >/dev/null 2>&1; then
      RUNNING="true"
    elif [ "$RETRY" -ne 3 ]; then
      echo "WARNING: Unable to start dynamic task service. Retrying..." >&2
      RETRY=$((RETRY + 1))
      sleep 10s
    else
      echo "ERROR: Unable to start dynamic task service. Shutting down." >&2
      # Non-zero status: a bare `exit` would report success here.
      exit 1
    fi
  done
  export LAUNCHER_DYN_COUNT="$HOSTNAME"
  export LAUNCHER_DYN_COUNT_PORT=9471

  if [ "${LAUNCHER_USE_PHI:-0}" -ne 0 ]; then
    #Start another tskserver for the Intel Xeon Phi cards
    # Count lines through a redirect: `wc -l FILE` prints "N FILE", which
    # unquoted would hand tskserver the file name as an extra argument.
    phi_njobs=$(wc -l < "$PHI_WORKDIR/$PHI_CONTROL_FILE" | awk '{print $1}')
    "$LAUNCHER_DIR/tskserver" "$phi_njobs" "$HOSTNAME" 9472 2>/dev/null &
    # $! is the PID of the job just backgrounded ($1 would be the first
    # command-line argument of this script).
    LAUNCHER_PHI_DYN_PID=$!
    disown "$LAUNCHER_PHI_DYN_PID"
    export LAUNCHER_PHI_DYN_COUNT="$HOSTNAME"
    export LAUNCHER_PHI_DYN_COUNT_PORT=9472
  fi
fi
# Optional process binding (LAUNCHER_BIND=1). The number of sockets, cores
# and processing units is taken from the line count of
# `lstopo-no-graphics --only <type>`. If more processes per node are
# requested than there are cores, hyperthreads are used (LAUNCHER_BIND_HT=1);
# if more than there are threads, LAUNCHER_PPN is lowered to the thread count
# and LAUNCHER_NPROCS is recomputed. LAUNCHER_PUPT is the number of cores or
# threads given to each task.
if [[ "$LAUNCHER_BIND" == "1" ]]; then
  # Print the number of lines lstopo emits for the object type in $1.
  count_lstopo_objects() {
    lstopo-no-graphics --only "$1" | wc -l | awk '{print $1}'
  }

  num_socks=$(count_lstopo_objects socket)
  num_cores=$(count_lstopo_objects core)
  num_threads=$(count_lstopo_objects pu)
  # A reported socket count of zero is treated as a single socket.
  if [ "$num_socks" -eq 0 ]; then
    num_socks=1
  fi

  # Decide first whether hyperthreads are needed at all.
  if [ "$LAUNCHER_PPN" -gt "$num_cores" ]; then
    LAUNCHER_BIND_HT=1
  else
    LAUNCHER_BIND_HT=0
  fi
  export LAUNCHER_BIND_HT

  if [ "$LAUNCHER_BIND_HT" -eq 1 ]; then
    if [ "$LAUNCHER_PPN" -gt "$num_threads" ]; then
      echo "WARNING: Requested Processes per Node ($LAUNCHER_PPN) exceeds number of available threads ($num_threads). Resetting..."
      export LAUNCHER_PPN=$num_threads
      export LAUNCHER_NPROCS=$(( $LAUNCHER_NHOSTS * $LAUNCHER_PPN ))
    fi
    pu_per_task=$(( $num_threads / $LAUNCHER_PPN ))
  else
    pu_per_task=$(( $num_cores / $LAUNCHER_PPN ))
  fi
  export LAUNCHER_PUPT=$pu_per_task
fi
#------------------------------
# Print the resolved configuration on stdout before any task is started
#------------------------------
printf '%s\n' \
  "Launcher: Setup complete." \
  "" \
  "------------- SUMMARY ---------------" \
  " Number of hosts: $LAUNCHER_NHOSTS" \
  " Working directory: $LAUNCHER_WORKDIR" \
  " Processes per host: $LAUNCHER_PPN" \
  " Total processes: $LAUNCHER_NPROCS" \
  " Total jobs: $LAUNCHER_NJOBS" \
  " Scheduling method: $LAUNCHER_SCHED"

# Binding details, shown only when process binding was requested.
if [[ "$LAUNCHER_BIND" == "1" ]]; then
  printf '%s\n' \
    "" \
    "------ Process Binding Enabled ------" \
    " Sockets per host: $num_socks" \
    " Cores per host: $num_cores" \
    " Threads per host: $num_threads"
  # The unit depends on whether hyperthreads are in use.
  case "$LAUNCHER_BIND_HT" in
    1) printf '%s\n' " Binding each task to $LAUNCHER_PUPT threads (Hyperthreads in use)" ;;
    *) printf '%s\n' " Binding each task to $LAUNCHER_PUPT cores (Hyperthreads ignored)" ;;
  esac
fi

# Xeon Phi details, shown only when Phi support is enabled.
if [[ "$LAUNCHER_USE_PHI" == "1" ]]; then
  printf '%s\n' \
    "" \
    "--- Intel Xeon Phi Support Enabled ---" \
    " Cards per host: $LAUNCHER_NPHI" \
    " Processes per card: $LAUNCHER_PHI_PPN"
fi

printf '%s\n' \
  "" \
  "-------------------------------------" \
  "Launcher: Starting parallel tasks..."
# Start init_launcher: directly on this host in localhost mode, otherwise
# once per host named in LAUNCHER_HOSTFILE via ssh, all in parallel. `i` is
# the zero-based host index passed as LAUNCHER_HOST_ID. `res` ends up as the
# local init_launcher status, or the first non-zero status among the ssh
# jobs (0 when all succeeded).
i=0
res=0
if [ "x$LAUNCHER_LOCALHOST" == "x1" ]; then
  env LAUNCHER_HOST_ID=0 "$LAUNCHER_DIR/init_launcher"
  res=$?
else
  # Read every whitespace-separated word of the hostfile into an array up
  # front, so the ssh jobs below keep this script's stdin. read returns
  # non-zero at end of file but still fills the array.
  launcher_hosts=()
  read -r -d '' -a launcher_hosts < "$LAUNCHER_HOSTFILE"

  launcher_pids=()
  for host in "${launcher_hosts[@]}"; do
    # LAUNCHER_NHOSTS used to be set from $np, which nothing in this script
    # defines, so the remote side received an empty value. An externally
    # provided np is still honoured.
    ssh "$host" "cd $LAUNCHER_WORKDIR; env $("$LAUNCHER_DIR/pass_env") LAUNCHER_NHOSTS=${np:-$LAUNCHER_NHOSTS} LAUNCHER_HOST_ID=$i $LAUNCHER_DIR/init_launcher" &
    launcher_pids+=("$!")
    i=$((i + 1))
  done

  # A bare `wait` always returns 0; wait on each job so that a failing host
  # is reflected in the final exit status.
  for launcher_pid in "${launcher_pids[@]}"; do
    wait "$launcher_pid"
    launcher_rc=$?
    if [ "$launcher_rc" -ne 0 ] && [ "$res" -eq 0 ]; then
      res=$launcher_rc
    fi
  done
fi
#Cleanup processes and files
# Stop the task servers started for dynamic scheduling, remove the hostfile
# named by LAUNCHER_RMI_HOSTFILE, report the outcome and exit with the status
# recorded in `res`.
if [ -n "$LAUNCHER_DYN_PID" ]; then
  kill "$LAUNCHER_DYN_PID"
fi
# Test the PID variable itself: the previous check looked at
# LAUNCHER_PHI_DYN, which is never set, so the Phi task server was left
# running.
if [ -n "$LAUNCHER_PHI_DYN_PID" ]; then
  kill "$LAUNCHER_PHI_DYN_PID"
fi
# Nothing to remove when no RMI hostfile was recorded; quoting and `--`
# protect paths with spaces or a leading dash.
if [ -n "$LAUNCHER_RMI_HOSTFILE" ]; then
  rm -f -- "$LAUNCHER_RMI_HOSTFILE"
fi

if [ "${res:-0}" -ne 0 ]; then
  echo "Launcher: Done. Job exited with code: $res"
else
  echo "Launcher: Done. Job exited without errors"
fi
exit "${res:-0}"