From 92b619234435fda8a14ec8c916086282c1eff77c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bob=20Dr=C3=B6ge?=
Date: Tue, 7 Jan 2025 11:36:49 +0100
Subject: [PATCH] try to run reframe tests with bot

---
 bot/check-test.sh | 238 ++++++++++++++++++++++++++++++++++++++++++++++
 bot/test.sh | 143 ++++++++++++++++++++++++++++
 2 files changed, 381 insertions(+)
 create mode 100755 bot/check-test.sh
 create mode 100755 bot/test.sh

diff --git a/bot/check-test.sh b/bot/check-test.sh
new file mode 100755
index 0000000..2731e75
--- /dev/null
+++ b/bot/check-test.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+#
+# Script that checks the Slurm job output for ReFrame test results and creates the test result file for the bot
+#
+# This script is part of the EESSI software layer, see
+# https://github.com/EESSI/software-layer.git
+#
+# author: Kenneth Hoste (HPC-UGent)
+#
+# license: GPLv2
+#
+job_dir=${PWD}
+job_out="slurm-${SLURM_JOB_ID}.out"
+job_test_result_file="_bot_job${SLURM_JOB_ID}.test"
+
+# Check that job output file is found
+[[ ${VERBOSE} -ne 0 ]] && echo ">> searching for job output file(s) matching '${job_out}'"
+if [[ -f ${job_out} ]]; then
+    SLURM_OUTPUT_FOUND=1
+    [[ ${VERBOSE} -ne 0 ]] && echo " found slurm output file '${job_out}'"
+else
+    SLURM_OUTPUT_FOUND=0
+    [[ ${VERBOSE} -ne 0 ]] && echo " Slurm output file '${job_out}' NOT found"
+fi
+
+# ReFrame prints e.g.
+#[----------] start processing checks
+#[ RUN ] GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:rome+default
+#[ RUN ] GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:genoa+default
+#[ RUN ] GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=1_cpn_2_nodes %module_name=GROMACS/2021.3-foss-2021a /f4194106 @snellius:genoa+default
+#[ FAIL ] (1/3) GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:genoa+default
+#==> test failed during 'sanity': test staged in '/scratch-shared/casparl/reframe_output/staging/snellius/genoa/default/GROMACS_EESSI_d597cff4'
+#[ OK ] (2/3) GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:rome+default
+#P: perf: 8.441 ns/day (r:0, l:None, u:None)
+#[ FAIL ] (3/3) GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=1_cpn_2_nodes %module_name=GROMACS/2021.3-foss-2021a /f4194106 @snellius:genoa+default
+#==> test failed during 'sanity': test staged in '/scratch-shared/casparl/reframe_output/staging/snellius/genoa/default/GROMACS_EESSI_f4194106'
+#[----------] all spawned checks have finished
+#[ FAILED ] Ran 3/3 test case(s) from 2 check(s) (2 failure(s), 0 skipped, 0 aborted)
+
+# We will grep for the last and final line, since this reflects the overall result
+# Specifically, we grep for FAILED, since this is also what we print if a step in the test script itself fails
+FAILED=-1
+if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then
+    GP_failed='\[\s*FAILED\s*\].*Ran .* test case'
+    grep_reframe_failed=$(grep -v "^>> searching for " "${job_dir}/${job_out}" | grep "${GP_failed}")
+    [[ $? -eq 0 ]] && FAILED=1 || FAILED=0
+    # have to be careful to not add searched for pattern into slurm out file
+    [[ ${VERBOSE} -ne 0 ]] && echo ">> searching for '${GP_failed}'"
+    [[ ${VERBOSE} -ne 0 ]] && echo "${grep_reframe_failed}"
+fi
+
+# Here, we grep for 'ERROR:', which is printed if a fatal_error is encountered when executing the test step
+# I.e. this is an error in execution of the run_tests.sh itself, NOT in running the actual tests
+ERROR=-1
+if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then
+    GP_error='ERROR: '
+    grep_out=$(grep -v "^>> searching for " "${job_dir}/${job_out}" | grep "${GP_error}")
+    [[ $? -eq 0 ]] && ERROR=1 || ERROR=0
+    # have to be careful to not add searched for pattern into slurm out file
+    [[ ${VERBOSE} -ne 0 ]] && echo ">> searching for '${GP_error}'"
+    [[ ${VERBOSE} -ne 0 ]] && echo "${grep_out}"
+fi
+
+SUCCESS=-1
+# Grep for the success pattern, so we can report the amount of tests run
+if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then
+    GP_success='\[\s*PASSED\s*\].*Ran .* test case'
+    grep_reframe_success=$(grep -v "^>> searching for " "${job_dir}/${job_out}" | grep "${GP_success}")
+    [[ $? -eq 0 ]] && SUCCESS=1 || SUCCESS=0
+    # have to be careful to not add searched for pattern into slurm out file
+    [[ ${VERBOSE} -ne 0 ]] && echo ">> searching for '${GP_success}'"
+    [[ ${VERBOSE} -ne 0 ]] && echo "${grep_reframe_success}"
+fi
+
+if [[ !
-z ${grep_reframe_failed} ]]; then
+    grep_reframe_result=${grep_reframe_failed}
+else
+    # Grep the entire output of ReFrame, so that we can report it in the foldable section of the test report
+    GP_success_full='(?s)\[----------\] start processing checks.*?\[==========\] Finished on [a-zA-Z0-9 ]*'
+    # Grab the full ReFrame report, then cut the irrelevant parts
+    # Note that the character limit for messages in github is around 65k, so cutting is important
+    grep_reframe_success_full=$( \
+        grep -v "^>> searching for " "${job_dir}/${job_out}" | \
+        # Use -z so the whole input is treated as one record and the pattern can match across lines
+        grep -Pzo "${GP_success_full}" | \
+        # Replace null character with newline, to undo the -z option
+        sed 's/\x00/\n/g' | \
+        # Remove the [ RUN ] lines from reframe, they are not very informative
+        grep -v -P '\[\s*RUN\s*]' | \
+        # Remove the line '[----------] all spawned checks have finished'
+        grep -v '\[-*\]' | \
+        # Remove the line '[==========] Finished on Mon Oct 7 21'
+        grep -v '\[=*\]' | \
+        # Remove blank line(s) from the report
+        grep -v '^$' | \
+        # Remove warnings about the local spawner not supporting memory requests
+        grep -v 'WARNING\: hooks\.req_memory_per_node does not support the scheduler you configured .local.*$' | \
+        # Strip color coding characters
+        sed 's/\x1B\[[0-9;]*m//g' | \
+        # Join all lines into one (NOTE(review): the replacement text for the newlines looks lost in this copy - confirm)
+        sed ':a;N;$!ba;s/\n//g' | \
+        # Replace % with %%. Use \%\% to interpret both %% as (non-special) characters
+        sed 's/\%/\%\%/g' \
+    )
+    # TODO (optional): we could impose a character limit here, and truncate if too long
+    # (though we should do that before the newline characters are replaced).
+    # If we do, we should probably re-append the final summary, e.g.
+    # [ PASSED ] Ran 10/10 test case(s) from 10 check(s) (0 failure(s), 0 skipped, 0 aborted)
+    # so that that is always displayed
+    # However, that's not implemented yet - let's see if this ever even becomes an issue
+    grep_reframe_result=${grep_reframe_success_full}
+fi
+echo "grep_reframe_result: ${grep_reframe_result}"
+
+# Determine summary, reason and status; the first matching condition below wins
+if [[ ${SLURM_OUTPUT_FOUND} -eq 0 ]]; then
+    summary=":cry: FAILURE"
+    reason="Job output file not found, cannot check test results."
+    status="FAILURE"
+# Should come before general errors: if SUCCESS==1, it indicates the test suite ran successfully
+# regardless of other things that might have gone wrong
+elif [[ ${SUCCESS} -eq 1 ]]; then
+    summary=":grin: SUCCESS"
+    reason=""
+    status="SUCCESS"
+# Should come before general errors: if FAILED==1, it indicates the test suite ran
+# otherwise the pattern wouldn't have been there
+elif [[ ${FAILED} -eq 1 ]]; then
+    summary=":cry: FAILURE"
+    reason="EESSI test suite produced failures."
+    status="FAILURE"
+elif [[ ${ERROR} -eq 1 ]]; then
+    summary=":cry: FAILURE"
+    reason="EESSI test suite was not run, test step itself failed to execute."
+    status="FAILURE"
+else
+    summary=":cry: FAILURE"
+    reason="Failed for unknown reason"
+    status="FAILURE"
+fi
+
+
+echo "[TEST]" > "${job_test_result_file}"
+echo -n "comment_description = " >> "${job_test_result_file}"
+
+# Use template for writing PR comment with details
+# construct and write complete PR comment details: implements third alternative
+comment_template="
__SUMMARY_FMT__
__REASON_FMT____REFRAME_FMT____DETAILS_FMT__
"
+comment_success_item_fmt=":white_check_mark: __ITEM__"
+comment_failure_item_fmt=":x: __ITEM__"
+
+# Initialize comment_description
+comment_description=${comment_template}
+
+# Now, start replacing template items one by one
+comment_summary_fmt="__SUMMARY__ _(click triangle for details)_"
+comment_summary="${comment_summary_fmt/__SUMMARY__/${summary}}"
+comment_description=${comment_description/__SUMMARY_FMT__/${comment_summary}}
+
+
+# Only add if there is a reason (e.g. no reason for successful runs)
+if [[ -n ${reason} ]]; then
+    comment_reason_fmt="
_Reason_
__REASONS__
"
+    reason_details="${comment_reason_fmt/__REASONS__/${reason}}"
+    comment_description=${comment_description/__REASON_FMT__/${reason_details}}
+else
+    comment_description=${comment_description/__REASON_FMT__/""}
+fi
+
+# Only add if there is a reframe summary (e.g. no reframe summary if reframe wasn't launched successfully)
+echo "ReFrame result:"
+echo "${grep_reframe_result}"
+if [[ -n ${grep_reframe_result} ]]; then
+    comment_reframe_fmt="
_ReFrame Summary_
__REFRAME_SUMMARY__
"
+    reframe_summary=${comment_reframe_fmt/__REFRAME_SUMMARY__/${grep_reframe_result}}
+    comment_description=${comment_description/__REFRAME_FMT__/${reframe_summary}}
+else
+    comment_description=${comment_description/__REFRAME_FMT__/""}
+fi
+
+# Declare functions (print_br_item: print format ${1} with every __ITEM__ replaced by ${2}, no newline added by echo)
+function print_br_item() {
+    local format="${1}"
+    local item="${2}"
+    echo -n "${format//__ITEM__/${item}}
"
+}
+
+function success() {
+    local format="${comment_success_item_fmt}"
+    local item="$1"
+    print_br_item "${format}" "${item}"
+}
+
+function failure() {
+    local format="${comment_failure_item_fmt}"
+    local item="$1"
+    print_br_item "${format}" "${item}"
+}
+
+function add_detail() {
+    local actual=${1}
+    local expected=${2}
+    local success_msg="${3}"
+    local failure_msg="${4}"
+    if [[ ${actual} -eq ${expected} ]]; then
+        success "${success_msg}"
+    else
+        failure "${failure_msg}"
+    fi
+}
+
+# first construct comment_details_list (one success/failure item per check),
+# then use it to set comment_details
+comment_details_list=""
+
+success_msg="job output file ${job_out}"
+failure_msg="no job output file ${job_out}"
+comment_details_list=${comment_details_list}$(add_detail ${SLURM_OUTPUT_FOUND} 1 "${success_msg}" "${failure_msg}")
+
+success_msg="no message matching ${GP_error}"
+failure_msg="found message matching ${GP_error}"
+comment_details_list=${comment_details_list}$(add_detail ${ERROR} 0 "${success_msg}" "${failure_msg}")
+
+# Add an escape character to every *, for it to be printed correctly in the comment on GitHub
+GP_failed="${GP_failed//\*/\\*}"
+success_msg="no message matching ${GP_failed}"
+failure_msg="found message matching ${GP_failed}"
+comment_details_list=${comment_details_list}$(add_detail ${FAILED} 0 "${success_msg}" "${failure_msg}")
+
+comment_details_fmt="
_Details_
__DETAILS_LIST__
"
+comment_details="${comment_details_fmt/__DETAILS_LIST__/${comment_details_list}}"
+comment_description=${comment_description/__DETAILS_FMT__/${comment_details}}
+
+# Actually writing the comment description to the result file
+echo "${comment_description}" >> "${job_test_result_file}"
+echo "status = ${status}" >> "${job_test_result_file}"
+
+exit 0
diff --git a/bot/test.sh b/bot/test.sh
new file mode 100755
index 0000000..9996121
--- /dev/null
+++ b/bot/test.sh
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+#
+# Script to run tests for the whole EESSI compatibility software layer.
+# Intended use is that it is called at the end of a (batch) job running on a compute node.
+#
+# This script is part of the EESSI compatibility layer, see
+# https://github.com/EESSI/compatibility-layer.git
+#
+# author: Thomas Roeblitz (@trz42)
+# author: Caspar van Leeuwen (@casparvl)
+# author: Bob Dröge (@bedroge)
+#
+# license: GPLv2
+#
+
+# ASSUMPTIONs:
+#  + assumption for the build step (as run through bot/build.sh which is provided
+#    in this repository too)
+#    - working directory has been prepared by the bot with a checkout of a
+#      pull request (OR by some other means)
+#    - the working directory contains a directory 'cfg' where the main config
+#      file 'job.cfg' has been deposited
+#    - the directory may contain any additional files referenced in job.cfg
+#  + assumptions for the test step
+#    - temporary storage is still available
+#      example
+#        Using /tmp/bot/EESSI/eessi.7l3zm2x7qH as temporary storage...
+#    - run test/compat_layer.py with ReFrame inside build container using tmp storage from build step
+#      plus possibly additional settings (repo, etc.)
+
+# stop as soon as something fails
+set -e
+
+# source utils.sh and cfg_files.sh
+source scripts/utils.sh
+source scripts/cfg_files.sh
+
+# defaults
+export JOB_CFG_FILE="${JOB_CFG_FILE_OVERRIDE:=./cfg/job.cfg}"
+HOST_ARCH=$(uname -m)
+
+# check if ${JOB_CFG_FILE} exists
+if [[ !
-r "${JOB_CFG_FILE}" ]]; then
+    fatal_error "job config file (JOB_CFG_FILE=${JOB_CFG_FILE}) does not exist or not readable"
+fi
+echo "bot/test.sh: showing ${JOB_CFG_FILE} from software-layer side"
+cat "${JOB_CFG_FILE}"
+
+echo "bot/test.sh: obtaining configuration settings from '${JOB_CFG_FILE}'"
+cfg_load "${JOB_CFG_FILE}"
+
+# if http_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $http_proxy
+HTTP_PROXY=$(cfg_get_value "site_config" "http_proxy")
+HTTP_PROXY=${HTTP_PROXY:-${http_proxy}}
+echo "bot/test.sh: HTTP_PROXY='${HTTP_PROXY}'"
+
+# if https_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $https_proxy
+HTTPS_PROXY=$(cfg_get_value "site_config" "https_proxy")
+HTTPS_PROXY=${HTTPS_PROXY:-${https_proxy}}
+echo "bot/test.sh: HTTPS_PROXY='${HTTPS_PROXY}'"
+
+LOCAL_TMP=$(cfg_get_value "site_config" "local_tmp")
+echo "bot/test.sh: LOCAL_TMP='${LOCAL_TMP}'"
+
+# try to determine tmp directory from build job ('|| true': a grep without match must not abort the script under 'set -e')
+EESSI_TMPDIR=$(grep -oP "To resume work add '--resume \K.*(?=')" "slurm-${SLURM_JOBID}.out" || true)
+
+if [[ -z ${EESSI_TMPDIR} ]]; then
+    echo "bot/test.sh: no information about tmp directory build step; --> giving up"
+    exit 2
+fi
+
+# obtain list of modules to be loaded
+LOAD_MODULES=$(cfg_get_value "site_config" "load_modules")
+echo "bot/test.sh: LOAD_MODULES='${LOAD_MODULES}'"
+
+# load modules if LOAD_MODULES is not empty
+if [[ -n ${LOAD_MODULES} ]]; then
+    for mod in $(echo ${LOAD_MODULES} | tr ',' '\n')
+    do
+        echo "bot/test.sh: loading module '${mod}'"
+        module load "${mod}"
+    done
+else
+    echo "bot/test.sh: no modules to be loaded"
+fi
+
+cpu_target_arch=$(cfg_get_value "architecture" "software_subdir" | cut -d/ -f1)
+host_arch=$(uname -m)
+eessi_arch=${cpu_target_arch:-${host_arch}}
+eessi_os=linux
+job_version=$(cfg_get_value "repository" "repo_version")
+eessi_version=${job_version:-2023.06}
+job_repo=$(cfg_get_value "repository" "repo_name")
+eessi_repo=${job_repo:-software.eessi.io}
+tar_topdir=/cvmfs/${eessi_repo}/versions
+
+if [ "${eessi_arch}" != "${host_arch}" ]; then
+    echo "Requested architecture (${eessi_arch}) is different from this machine's architecture ($(uname -m))!"
+    exit 1
+fi
+
+exit_code=0 # stays 0 unless get_container_runtime fails
+RUNTIME=$(get_container_runtime) || exit_code=$? # '||' keeps 'set -e' from aborting before check_exit_code can report
+[[ ${VERBOSE} == '-vvv' ]] && echo "RUNTIME='${RUNTIME}'"
+check_exit_code ${exit_code} "using runtime ${RUNTIME}" "oh no, neither apptainer nor singularity available"
+
+# Set up paths and mount points for Apptainer
+if [[ -z ${APPTAINER_CACHEDIR} ]]; then
+    export APPTAINER_CACHEDIR=${EESSI_TMPDIR}/apptainer_cache
+    [[ ${VERBOSE} == '-vvv' ]] && echo "APPTAINER_CACHEDIR='${APPTAINER_CACHEDIR}'"
+fi
+export APPTAINER_BIND="${EESSI_TMPDIR}/cvmfs:/cvmfs,${PWD}:/compatibility-layer"
+export APPTAINER_BIND="${APPTAINER_BIND},${EESSI_TMPDIR}/tmp:/tmp"
+[[ ${VERBOSE} == '-vvv' ]] && echo "APPTAINER_BIND='${APPTAINER_BIND}'"
+export APPTAINER_HOME="${EESSI_TMPDIR}/home:/home/${USER}"
+[[ ${VERBOSE} == '-vvv' ]] && echo "APPTAINER_HOME='${APPTAINER_HOME}'"
+
+# also define SINGULARITY_* env vars
+if [[ -z ${SINGULARITY_CACHEDIR} ]]; then
+    export SINGULARITY_CACHEDIR=${EESSI_TMPDIR}/apptainer_cache
+    [[ ${VERBOSE} == '-vvv' ]] && echo "SINGULARITY_CACHEDIR='${SINGULARITY_CACHEDIR}'"
+fi
+export SINGULARITY_BIND="${EESSI_TMPDIR}/cvmfs:/cvmfs,${PWD}:/compatibility-layer"
+export SINGULARITY_BIND="${SINGULARITY_BIND},${EESSI_TMPDIR}/tmp:/tmp"
+[[ ${VERBOSE} == '-vvv' ]] && echo "SINGULARITY_BIND='${SINGULARITY_BIND}'"
+export SINGULARITY_HOME="${EESSI_TMPDIR}/home:/home/${USER}"
+[[ ${VERBOSE} == '-vvv' ]] && echo "SINGULARITY_HOME='${SINGULARITY_HOME}'"
+
+CONTAINER=docker://ghcr.io/eessi/bootstrap-prefix:debian11
+
+${RUNTIME} shell ${CONTAINER} <