Improvements to grid_submit #1561

Merged · 1 commit · Mar 28, 2024
77 changes: 40 additions & 37 deletions GRID/utils/grid_submit.sh
@@ -411,37 +411,46 @@ EOF
     echo -ne "\b\b\b${spin[$((counter%4))]} ${JOBSTATUS}"
     let counter=counter+1
     if [ ! "${counter}" == "100" ]; then
       # ensures that we see spinner ... but only check for new job
       # status every 100 * 0.5 = 50s?
       continue
     fi
-    let counter=0
+    let counter=0 # reset counter
     JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $4}')
     # echo -ne "Waiting for jobs to return; Last status ${JOBSTATUS}"
-    if [ "$JOBSTATUS" == "D" ]; then
+
+    if [ "${JOBSTATUS}" == "D" ]; then
       echo "Job done"
-      WAITFORALIEN=""
+      WAITFORALIEN="" # guarantees to go out of outer while loop
+
       if [ "${FETCHOUTPUT}" ]; then
-        SUBJOBIDS=""
-        while [ ! ${SUBJOBIDS} ]; do
-          SUBJOBIDS=($(alien.py ps --trace ${MY_JOBID} | awk '/Subjob submitted/' | sed 's/.*submitted: //' | tr '\n' ' '))
-          sleep 1
-        done
-        # TODO: make this happen in a single alien.py session and with parallel copying
-        echo "Fetching results"
-        for splitcounter in `seq 1 ${PRODSPLIT}`; do
-          # we still need to check if this particular subjob was successful
-          SUBJOBSTATUS=$(alien.py ps -j ${SUBJOBIDS[splitcounter-1]} | awk '//{print $4}')
-          if [ "$SUBJOBSTATUS" == "D" ]; then
-            SPLITOUTDIR=$(printf "%03d" ${splitcounter})
-            [ ! -f ${SPLITOUTDIR} ] && mkdir ${SPLITOUTDIR}
-            echo "Fetching result files for subjob ${splitcounter} into ${PWD}"
-            CPCMD="alien.py cp ${MY_JOBWORKDIR}/${SPLITOUTDIR}/* file:./${SPLITOUTDIR}"
-            eval "${CPCMD}" 2> /dev/null
-          else
-            echo "Not fetching files for subjob ${splitcounter} since job code is ${SUBJOBSTATUS}"
-          fi
-        done
-        wait
+        SUBJOBIDS=()
+        SUBJOBSTATUSES=()
+        echo "Fetching subjob info"
+        while [ "${#SUBJOBIDS[@]}" == "0" ]; do
+          QUERYRESULT=$(ALIENPY_JSON=true alien.py ps -a -m ${MY_JOBID})
+          SUBJOBIDS=($(echo ${QUERYRESULT} | jq -r '.results[].id' | tr '\n' ' '))
+          SUBJOBSTATUSES=($(echo ${QUERYRESULT} | jq -r '.results[].status' | tr '\n' ' '))
+          # echo "LENGTH SUBJOBS ${#SUBJOBIDS[@]}"
+          sleep 1
+        done
+        # TODO: make this happen with parallel copying
+        echo "Fetching results for ${PRODSPLIT} sub-jobs"
+        for splitcounter in `seq 1 ${PRODSPLIT}`; do
+          let jobindex=splitcounter-1
+          THIS_STATUS=${SUBJOBSTATUSES[jobindex]}
+          THIS_JOB=${SUBJOBIDS[jobindex]}
+          echo "Fetching for job ${THIS_JOB}"
+          if [ "${THIS_STATUS}" == "DONE" ]; then
+            SPLITOUTDIR=$(printf "%03d" ${splitcounter})
+            [ ! -f ${SPLITOUTDIR} ] && mkdir ${SPLITOUTDIR}
+            echo "Fetching result files for subjob ${splitcounter} into ${PWD}"
+            CPCMD="alien.py cp ${MY_JOBWORKDIR}/${SPLITOUTDIR}/* file:./${SPLITOUTDIR}"
+            eval "${CPCMD}" 2> /dev/null
+          else
+            echo "Not fetching files for subjob ${splitcounter} since job code is ${THIS_STATUS}"
+          fi
+        done
       fi
     fi
if [[ "${FOO:0:1}" == [EK] ]]; then
@@ -541,13 +550,13 @@ if [ "${ONGRID}" = "1" ]; then
 fi
 
 # ----------- DOWNLOAD ADDITIONAL HELPERS ----------------------------
-curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py &> /dev/null
-chmod +x analyse_CPU.py
+# curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py &> /dev/null
+# chmod +x analyse_CPU.py
 export PATH=$PATH:$PWD
-export JOBUTILS_MONITORCPU=ON
-export JOBUTILS_WRAPPER_SLEEP=5
-#export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes --> will be the task of pipeline runner? (or make it optional)
-export JOBUTILS_MONITORMEM=ON
+# export JOBUTILS_MONITORCPU=ON
+# export JOBUTILS_WRAPPER_SLEEP=5
+# export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes --> will be the task of pipeline runner? (or make it optional)
+# export JOBUTILS_MONITORMEM=ON
 
 # ----------- EXECUTE ACTUAL JOB ------------------------------------
 # source the actual job script from the work dir
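
Side note on the exports disabled in this hunk: if CPU/memory monitoring is still wanted for an individual debugging run, the same variables could be exported in the shell before invoking the script locally instead of being hard-coded here. A sketch, assuming the JOBUTILS_* variables keep the semantics given by the inline comments above:

export JOBUTILS_MONITORCPU=ON    # sample CPU usage of the payload
export JOBUTILS_MONITORMEM=ON    # sample memory usage of the payload
export JOBUTILS_WRAPPER_SLEEP=5  # wrapper sampling interval in seconds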
@@ -558,13 +567,7 @@ chmod +x ./alien_jobscript.sh
 cp alien_log_${ALIEN_PROC_ID:-0}.txt logtmp_${ALIEN_PROC_ID:-0}.txt
 [ "${ALIEN_JOB_OUTPUTDIR}" ] && upload_to_Alien logtmp_${ALIEN_PROC_ID:-0}.txt ${ALIEN_JOB_OUTPUTDIR}/
 
-# MOMENTARILY WE ZIP ALL LOG FILES
-ziparchive=logs_PROCID${ALIEN_PROC_ID:-0}.zip
-find ./ -name "*.log*" -exec zip ${ziparchive} {} ';'
-find ./ -name "*mergerlog*" -exec zip ${ziparchive} {} ';'
-find ./ -name "*serverlog*" -exec zip ${ziparchive} {} ';'
-find ./ -name "*workerlog*" -exec zip ${ziparchive} {} ';'
-find ./ -name "alien_log*.txt" -exec zip ${ziparchive} {} ';'
 echo "Job done"
 
 # We need to exit for the ALIEN JOB HANDLER!
 exit 0
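
On the TODO left in the first hunk ("make this happen with parallel copying"): one possible shape is to launch the per-subjob copies in the background and collect them with wait. An untested sketch using the same variables as in the diff; per-copy error handling is omitted:

for splitcounter in `seq 1 ${PRODSPLIT}`; do
  let jobindex=splitcounter-1
  if [ "${SUBJOBSTATUSES[jobindex]}" == "DONE" ]; then
    SPLITOUTDIR=$(printf "%03d" ${splitcounter})
    mkdir -p ${SPLITOUTDIR}
    # one background alien.py cp per DONE subjob
    alien.py cp ${MY_JOBWORKDIR}/${SPLITOUTDIR}/* file:./${SPLITOUTDIR} 2> /dev/null &
  fi
done
wait # block until all background copies have finished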