Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve error checking in rt.sh #2388

Closed
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
4830205
Specify the squeue format explicitly
DusanJovic-NOAA Jul 26, 2024
fb56194
Add ${ECF_TRYNO} suffix to compile and run log files, if ecflow is used
DusanJovic-NOAA Jul 26, 2024
47e9560
Check the job exit status in rt_utils.sh
DusanJovic-NOAA Jul 29, 2024
fbd9014
Fix shellcheck warnings
DusanJovic-NOAA Jul 29, 2024
306387a
Fix shellcheck warnings
DusanJovic-NOAA Jul 29, 2024
3b496f4
Add ecflow_client label update
DusanJovic-NOAA Jul 29, 2024
f93e983
Support COMPLETING Slurm job status
DusanJovic-NOAA Jul 29, 2024
8d1cebc
Merge remote-tracking branch 'origin/develop' into rt_squeue_state
DusanJovic-NOAA Aug 1, 2024
9690557
Move code from check_results function into run_test.sh
DusanJovic-NOAA Aug 2, 2024
107001d
Fix shellcheck warning
DusanJovic-NOAA Aug 2, 2024
72a38d3
Add timeout test
DusanJovic-NOAA Aug 3, 2024
5a0de70
Fix shellcheck warning
DusanJovic-NOAA Aug 5, 2024
db70a38
Merge remote-tracking branch 'origin/develop' into rt_squeue_state
DusanJovic-NOAA Aug 5, 2024
05164c5
Merge remote-tracking branch 'origin/develop' into rt_squeue_state
DusanJovic-NOAA Aug 6, 2024
3ff2c92
Remove setting ecflow labels from submit_and_wait. Let's not complica…
DusanJovic-NOAA Aug 6, 2024
a0bd6a1
Shellcheck warning fix
DusanJovic-NOAA Aug 6, 2024
3d220fb
Merge remote-tracking branch 'origin/develop' into rt_squeue_state
DusanJovic-NOAA Aug 13, 2024
a0ca819
Use realpath instead of readlink in build.sh and tests/compile.sh
DusanJovic-NOAA Aug 13, 2024
84ab299
Update run_test.sh. Call write_fail_test if test fails in check_results
DusanJovic-NOAA Aug 19, 2024
37b0d74
Merge remote-tracking branch 'origin/develop' into rt_squeue_state
DusanJovic-NOAA Aug 20, 2024
d38f4f5
Merge remote-tracking branch 'origin/develop' into rt_squeue_state
DusanJovic-NOAA Aug 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tests/error-test.conf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ COMPILE | atm_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16,FV3_GFS_v16_fl
# This should succeed
RUN | control_c48.v2.sfc | | baseline |

# This should fail due to wall clock timeout
RUN | control_c48.v2.sfc_timeout | | baseline |

# These tests should always fail, and prevent the workflow from completing.
RUN | fail_to_copy | | baseline |
RUN | fail_to_run | | baseline |
Expand Down
1 change: 1 addition & 0 deletions tests/rt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1041,6 +1041,7 @@ if [[ ${skip_check_results} == true ]]; then
else
REGRESSIONTEST_LOG=${PATHRT}/logs/RegressionTests_${MACHINE_ID}.log
fi
rm -f "${REGRESSIONTEST_LOG}"

TEST_START_TIME="$(date '+%Y%m%d %T')"
export TEST_START_TIME
Expand Down
201 changes: 47 additions & 154 deletions tests/rt_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ submit_and_wait() {
ROCOTO=${ROCOTO:-false}
ECFLOW=${ECFLOW:-false}

local test_status='PASS'
case ${SCHEDULER} in
pbs)
qsubout=$( qsub "${job_card}" )
Expand All @@ -151,6 +150,7 @@ submit_and_wait() {
local count=0
local job_running=''
echo "rt_utils.sh: Job is waiting to enter the queue..."
[[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "Waiting to enter the queue"
until [[ ${job_running} == 'true' ]]
do
case ${SCHEDULER} in
Expand All @@ -177,6 +177,10 @@ submit_and_wait() {
if [[ ${count} -eq 13 ]]; then echo "No job in queue after one minute, exiting..."; exit 2; fi
done
echo "rt_utils.sh Job (${jobid}) is now in the queue."
if [[ ${ECFLOW:-false} == true ]]; then
ecflow_client --label=job_id "${jobid}"
ecflow_client --label=job_status "Submitted"
fi

# wait for the job to finish and compare results
local n=1
Expand All @@ -187,26 +191,38 @@ submit_and_wait() {
set +e
job_info=$( qstat "${jobid}" )
set -e
if grep -q "${jobid}" <<< "${job_info}"; then
job_running=true
# Getting the status letter from scheduler info
status=$( grep "${jobid}" <<< "${job_info}" )
status=$( awk '{print $5}' <<< "${status}" )
else
job_running=false
status='COMPLETED'
set +e
exit_status=$( qstat "${jobid}" -x -f | grep Exit_status | awk '{print $3}')
set -e
if [[ ${exit_status} != 0 ]]; then
status='FAILED'
fi
fi
;;
slurm)
job_info=$( squeue -u "${USER}" -j "${jobid}" )
job_info=$( squeue -u "${USER}" -j "${jobid}" -o '%i %T' )
if grep -q "${jobid}" <<< "${job_info}"; then
job_running=true
else
job_running=false
job_info=$( sacct -n -j "${jobid}" --format=JobID,state%20,Jobname%64 | grep "^${jobid}" | grep "${JBNME}" )
fi
# Getting the status letter from scheduler info
status=$( grep "${jobid}" <<< "${job_info}" )
status=$( awk '{print $2}' <<< "${status}" )
;;
*)
;;
esac


if grep -q "${jobid}" <<< "${job_info}"; then
job_running=true
else
job_running=false
continue
fi

# Getting the status letter from scheduler info
status=$( grep "${jobid}" <<< "${job_info}" )
status=$( awk '{print $5}' <<< "${status}" )

case ${status} in
#waiting cases
#pbs: Q
Expand All @@ -217,7 +233,7 @@ submit_and_wait() {
#running cases
#pbs: R
#slurm: (old: R, new: RUNNING)
R|RUNNING)
R|RUNNING|COMPLETING)
status_label='Job running'
;;
#held cases
Expand All @@ -229,14 +245,16 @@ submit_and_wait() {
#fail/completed cases
#slurm: F/FAILED TO/TIMEOUT CA/CANCELLED
F|TO|CA|FAILED|TIMEOUT|CANCELLED)
echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!!"
echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!! status=${status}"
job_running=false #Trip the loop to end with these status flags
[[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "Failed"
interrupt_job
exit 1
;;
#completed
#pbs only: C-Complete E-Exiting
C|E)
#pbs: C-Complete E-Exiting
#slurm: CD/COMPLETED
C|E|CD|COMPLETED)
status_label='Completed'
;;
*)
Expand All @@ -247,146 +265,13 @@ submit_and_wait() {
esac

echo "${n} min. ${SCHEDULER^} Job ${jobid} Status: ${status_label} (${status})"
[[ ${ECFLOW:-false} == true ]] && ecflow_client --label=job_status "${status_label}"

(( n=n+1 ))
sleep 60 & wait $!
done
}

#######################################
# Check the output of one regression test against its baseline, or, when
# CREATE_BASELINE is not "false", copy the output into a new baseline.
# Every progress message is written both to stdout and to ${RT_LOG}.
#
# Globals read:
#   TEST_ID, RTPWD, CNTL_DIR, RT_COMPILER, RUNDIR, RT_LOG, LIST_FILES,
#   CREATE_BASELINE, MACHINE_ID, CMP_DATAONLY, NEW_BASELINE, PATHRT,
#   ECF_TRYNO (only when ECFLOW is true)
# Globals written:
#   ROCOTO, ECFLOW (defaulted to "false" when unset), TRIES
# Outputs:
#   ${RT_LOG} (truncated, then appended), "<file>_nccmp.log" in the current
#   directory for every NetCDF comparison, and on failure a line appended to
#   ${PATHRT}/fail_test_${TEST_ID}.
# Returns:
#   0 if every file passed (or was copied), 1 otherwise.
#######################################
check_results() {
  echo "rt_utils.sh: Checking results of the regression test: ${TEST_ID}"

  ROCOTO=${ROCOTO:-false}
  ECFLOW=${ECFLOW:-false}

  local test_status='PASS'
  # Loop variable and per-file comparison status; kept local so they neither
  # leak into the caller nor carry over between invocations.
  local i
  local d

  # Give one minute for data to show up on file system
  #sleep 60

  # The first write truncates the log; everything after this appends.
  {
    echo
    echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}"
    echo "working dir = ${RUNDIR}"
    echo "Checking test ${TEST_ID} results ...."
  } > "${RT_LOG}"
  echo
  echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}"
  echo "working dir = ${RUNDIR}"
  echo "Checking test ${TEST_ID} results ...."

  if [[ ${CREATE_BASELINE} = false ]]; then
    #
    # --- regression test comparison
    #
    # LIST_FILES is a whitespace-separated list, so it is split on purpose.
    for i in ${LIST_FILES} ; do
      printf %s " Comparing ${i} ....." >> "${RT_LOG}"
      printf %s " Comparing ${i} ....."

      if [[ ! -f ${RUNDIR}/${i} ]] ; then

        echo ".......MISSING file" >> "${RT_LOG}"
        echo ".......MISSING file"
        test_status='FAIL'

      elif [[ ! -f ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i} ]] ; then

        echo ".......MISSING baseline" >> "${RT_LOG}"
        echo ".......MISSING baseline"
        test_status='FAIL'

      else
        # Reset the status for every file. Without this, a NetCDF file on a
        # machine that is not in the nccmp list below would be reported with
        # whatever status the previous file left behind.
        d=0

        if [[ ${i##*.} == nc* ]] ; then
          # NetCDF files are compared with nccmp, but only on the listed
          # machines; elsewhere the file is not compared and d stays 0.
          if [[ " orion hercules hera wcoss2 acorn derecho gaea jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then
            printf "USING NCCMP.." >> "${RT_LOG}"
            printf "USING NCCMP.."
            if [[ ${CMP_DATAONLY} == false ]]; then
              nccmp -d -S -q -f -g -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 || d=$?
            else
              nccmp -d -S -q -f -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 || d=$?
            fi
            # Any exit status other than 0 or 1 is reported as a tool error.
            if [[ ${d} -ne 0 && ${d} -ne 1 ]]; then
              printf "....ERROR" >> "${RT_LOG}"
              printf "....ERROR"
              test_status='FAIL'
            fi
          fi
        else
          printf "USING CMP.." >> "${RT_LOG}"
          printf "USING CMP.."
          cmp "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" >/dev/null 2>&1 || d=$?
          # cmp exit status 2 is reported as a tool error.
          if [[ ${d} -eq 2 ]]; then
            printf "....ERROR" >> "${RT_LOG}"
            printf "....ERROR"
            test_status='FAIL'
          fi

        fi

        if [[ ${d} -ne 0 ]]; then
          echo "....NOT IDENTICAL" >> "${RT_LOG}"
          echo "....NOT IDENTICAL"
          test_status='FAIL'
        else
          echo "....OK" >> "${RT_LOG}"
          echo "....OK"
        fi

      fi

    done

  else
    #
    # --- create baselines
    #
    echo;echo "Moving baseline ${TEST_ID} files ...."
    # Group both echoes so the blank line also lands in the log instead of
    # being printed to stdout a second time.
    { echo;echo "Moving baseline ${TEST_ID} files ...."; } >> "${RT_LOG}"

    for i in ${LIST_FILES} ; do
      printf %s " Moving ${i} ....."
      printf %s " Moving ${i} ....." >> "${RT_LOG}"
      if [[ -f ${RUNDIR}/${i} ]] ; then
        mkdir -p "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/$(dirname "${i}")"
        cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/${i}"
        echo "....OK" >> "${RT_LOG}"
        echo "....OK"
      else
        echo "....NOT OK. Missing ${RUNDIR}/${i}" >> "${RT_LOG}"
        echo "....NOT OK. Missing ${RUNDIR}/${i}"
        test_status='FAIL'
      fi
    done

  fi

  # Copy the wall time and memory lines from the model output into the log.
  {
    echo
    grep "The total amount of wall time" "${RUNDIR}/out"
    grep "The maximum resident set size" "${RUNDIR}/out"
    echo
  } >> "${RT_LOG}"

  # Under ecFlow, record the attempt number when this is a retry.
  TRIES=''
  if [[ ${ECFLOW} == true ]]; then
    if [[ ${ECF_TRYNO} -gt 1 ]]; then
      TRIES=" Tries: ${ECF_TRYNO}"
    fi
  fi
  echo "Test ${TEST_ID} ${test_status}${TRIES}" >> "${RT_LOG}"
  echo >> "${RT_LOG}"
  echo "Test ${TEST_ID} ${test_status}${TRIES}"
  echo

  if [[ ${test_status} = 'FAIL' ]]; then
    echo "${TEST_ID} failed in check_result" >> "${PATHRT}/fail_test_${TEST_ID}"
    return 1
  else
    return 0
  fi
}


kill_job() {
echo "rt_utils.sh: Killing job: ${jobid} on ${SCHEDULER}..."
[[ -z $1 ]] && exit 1
Expand Down Expand Up @@ -580,7 +465,11 @@ ecflow_create_compile_task() {

cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/compile_${COMPILE_ID}.ecf"
%include <head.h>
${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log" 2>&1 &
(
cd "${LOG_DIR}"
ln -sf "compile_${COMPILE_ID}.log.\${ECF_TRYNO}" "compile_${COMPILE_ID}.log"
)
${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log.\${ECF_TRYNO}" 2>&1 &
%include <tail.h>
EOF
{
Expand All @@ -596,7 +485,11 @@ ecflow_create_run_task() {
echo "rt_utils.sh: ${TEST_ID}: Creating ECFLOW run task"
cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/${TEST_ID}${RT_SUFFIX}.ecf"
%include <head.h>
${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log" 2>&1 &
(
cd "${LOG_DIR}"
ln -sf "run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log"
)
${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" 2>&1 &
%include <tail.h>
EOF
{
Expand Down
11 changes: 10 additions & 1 deletion tests/run_compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,16 @@ cleanup() {

#######################################
# Record that the compile job failed and terminate the script.
# Appends a line to ${PATHRT}/fail_${JBNME}, then exits. The exit status
# depends on whether a workflow manager is driving the run.
# Globals read: JBNME, PATHRT, ROCOTO, ECFLOW
# Returns: never returns; exits 1 under ROCOTO/ECFLOW, 0 otherwise.
#######################################
write_fail_test() {
  echo "${JBNME} failed in run_compile" >> "${PATHRT}/fail_${JBNME}"
  # No unconditional exit here: it would make the workflow-aware exit status
  # below unreachable.
  if [[ ${ROCOTO:-false} == true ]] || [[ ${ECFLOW:-false} == true ]]; then
    # if this script has been submitted by a workflow return non-zero exit status
    # so that workflow can resubmit it
    exit 1
  else
    # if this script has been executed interactively, return zero exit status
    # so that rt.sh can continue running, and hope that rt.sh's generate_log
    # will catch failed tests
    exit 0
  fi
}

remove_fail_test() {
Expand Down
Loading
Loading