try to run reframe tests with bot
bedroge committed Jan 7, 2025
1 parent 842ae7b commit 92b6192
Showing 2 changed files with 381 additions and 0 deletions.
238 changes: 238 additions & 0 deletions bot/check-test.sh
@@ -0,0 +1,238 @@
#!/bin/bash
#
# Dummy script that only creates a test result file for the bot, without actually checking anything
#
# This script is part of the EESSI software layer, see
# https://github.com/EESSI/software-layer.git
#
# author: Kenneth Hoste (HPC-UGent)
#
# license: GPLv2
#
job_dir=${PWD}
job_out="slurm-${SLURM_JOB_ID}.out"
job_test_result_file="_bot_job${SLURM_JOB_ID}.test"

# Check that the job output file is found
[[ ${VERBOSE} -ne 0 ]] && echo ">> searching for job output file(s) matching '"${job_out}"'"
if [[ -f ${job_out} ]]; then
SLURM_OUTPUT_FOUND=1
[[ ${VERBOSE} -ne 0 ]] && echo " found slurm output file '"${job_out}"'"
else
SLURM_OUTPUT_FOUND=0
[[ ${VERBOSE} -ne 0 ]] && echo " Slurm output file '"${job_out}"' NOT found"
fi

# ReFrame prints e.g.
#[----------] start processing checks
#[ RUN ] GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:rome+default
#[ RUN ] GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:genoa+default
#[ RUN ] GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=1_cpn_2_nodes %module_name=GROMACS/2021.3-foss-2021a /f4194106 @snellius:genoa+default
#[ FAIL ] (1/3) GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:genoa+default
#==> test failed during 'sanity': test staged in '/scratch-shared/casparl/reframe_output/staging/snellius/genoa/default/GROMACS_EESSI_d597cff4'
#[ OK ] (2/3) GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=2_nodes %module_name=GROMACS/2021.3-foss-2021a /d597cff4 @snellius:rome+default
#P: perf: 8.441 ns/day (r:0, l:None, u:None)
#[ FAIL ] (3/3) GROMACS_EESSI %benchmark_info=HECBioSim/Crambin %nb_impl=cpu %scale=1_cpn_2_nodes %module_name=GROMACS/2021.3-foss-2021a /f4194106 @snellius:genoa+default
#==> test failed during 'sanity': test staged in '/scratch-shared/casparl/reframe_output/staging/snellius/genoa/default/GROMACS_EESSI_f4194106'
#[----------] all spawned checks have finished
#[ FAILED ] Ran 3/3 test case(s) from 2 check(s) (2 failure(s), 0 skipped, 0 aborted)

# We grep for the final summary line, since it reflects the overall result
# Specifically, we grep for FAILED, since this is also what we print if a step in the test script itself fails
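# As an illustration (derived from the sample output above): a failed run ends with
#   [  FAILED  ] Ran 3/3 test case(s) from 2 check(s) (2 failure(s), 0 skipped, 0 aborted)
# which matches GP_failed below, while a fully successful run ends with something like
#   [  PASSED  ] Ran 3/3 test case(s) from 3 check(s) (0 failure(s), 0 skipped, 0 aborted)
# which matches GP_success instead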
FAILED=-1
if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then
GP_failed='\[\s*FAILED\s*\].*Ran .* test case'
grep_reframe_failed=$(grep -v "^>> searching for " ${job_dir}/${job_out} | grep "${GP_failed}")
[[ $? -eq 0 ]] && FAILED=1 || FAILED=0
# be careful not to add the pattern we search for to the Slurm output file
[[ ${VERBOSE} -ne 0 ]] && echo ">> searching for '"${GP_failed}"'"
[[ ${VERBOSE} -ne 0 ]] && echo "${grep_reframe_failed}"
fi

# Here, we grep for 'ERROR:', which is printed if a fatal_error is encountered when executing the test step,
# i.e. an error in the execution of run_tests.sh itself, NOT in running the actual tests
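# A (hypothetical) example of a matching line in the job output:
#   ERROR: job config file (JOB_CFG_FILE=./cfg/job.cfg) does not exist or is not readable
# the exact message depends on which step called fatal_error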
ERROR=-1
if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then
GP_error='ERROR: '
grep_out=$(grep -v "^>> searching for " ${job_dir}/${job_out} | grep "${GP_error}")
[[ $? -eq 0 ]] && ERROR=1 || ERROR=0
# be careful not to add the pattern we search for to the Slurm output file
[[ ${VERBOSE} -ne 0 ]] && echo ">> searching for '"${GP_error}"'"
[[ ${VERBOSE} -ne 0 ]] && echo "${grep_out}"
fi

SUCCESS=-1
# Grep for the success pattern, so we can report the number of test cases run
if [[ ${SLURM_OUTPUT_FOUND} -eq 1 ]]; then
GP_success='\[\s*PASSED\s*\].*Ran .* test case'
grep_reframe_success=$(grep -v "^>> searching for " ${job_dir}/${job_out} | grep "${GP_success}")
[[ $? -eq 0 ]] && SUCCESS=1 || SUCCESS=0
# be careful not to add the pattern we search for to the Slurm output file
[[ ${VERBOSE} -ne 0 ]] && echo ">> searching for '"${GP_success}"'"
[[ ${VERBOSE} -ne 0 ]] && echo "${grep_reframe_success}"
fi

if [[ ! -z ${grep_reframe_failed} ]]; then
grep_reframe_result=${grep_reframe_failed}
else
# Grep the entire output of ReFrame, so that we can report it in the foldable section of the test report
GP_success_full='(?s)\[----------\] start processing checks.*?\[==========\] Finished on [a-zA-Z0-9 ]*'
# Grab the full ReFrame report, then cut the irrelevant parts
# Note that the character limit for comments on GitHub is around 65k, so cutting is important
grep_reframe_success_full=$( \
grep -v "^>> searching for " ${job_dir}/${job_out} | \
# Use -z so the (?s) pattern can match across newlines (input is treated as one NUL-delimited record)
grep -Pzo "${GP_success_full}" | \
# Replace null character with newline, to undo the -z option
sed 's/\x00/\n/g' | \
# Remove the [ RUN ] lines from ReFrame; they are not very informative
grep -v -P '\[\s*RUN\s*]' | \
# Remove separator lines like '[----------] all spawned checks have finished'
grep -v '\[-*\]' | \
# Remove the closing line, e.g. '[==========] Finished on Mon Oct 7 21'
grep -v '\[=*\]' | \
# Remove blank line(s) from the report
grep -v '^$' | \
# Remove warnings about the local spawner not supporting memory requests
grep -v 'WARNING\: hooks\.req_memory_per_node does not support the scheduler you configured .local.*$' | \
# Strip color coding characters
sed 's/\x1B\[[0-9;]*m//g' | \
# Replace all newline characters with <br/>
sed ':a;N;$!ba;s/\n/<br\/>/g' | \
# Replace % with %%; \%\% makes both % characters be treated as plain (non-special) characters
sed 's/\%/\%\%/g' \
)
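# A minimal, self-contained sketch of the multiline extraction above (assuming GNU grep
# with PCRE support): (?s) lets '.' match newlines, and -z makes grep treat the whole
# input as a single NUL-terminated record:
#   printf '%s\n' 'noise' '[----------] start processing checks' 'SOME RESULT' \
#       '[==========] Finished on Mon Jan 1' 'noise' \
#     | grep -Pzo '(?s)\[----------\] start processing checks.*?\[==========\] Finished on [a-zA-Z0-9 ]*' \
#     | sed 's/\x00/\n/g'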
# TODO (optional): we could impose a character limit here, and truncate if too long
# (though we should do that before inserting the <br/> statements).
# If we do, we should probably re-append the final summary, e.g.
# [ PASSED ] Ran 10/10 test case(s) from 10 check(s) (0 failure(s), 0 skipped, 0 aborted)
# so that it is always displayed
# However, that's not implemented yet - let's see if this ever becomes an issue
grep_reframe_result=${grep_reframe_success_full}
fi
echo "grep_reframe_result: ${grep_reframe_result}"

echo "[TEST]" > ${job_test_result_file}
if [[ ${SLURM_OUTPUT_FOUND} -eq 0 ]]; then
summary=":cry: FAILURE"
reason="Job output file not found, cannot check test results."
status="FAILURE"
# Should come before general errors: if SUCCESS==1, it indicates the test suite ran successfully
# regardless of other things that might have gone wrong
elif [[ ${SUCCESS} -eq 1 ]]; then
summary=":grin: SUCCESS"
reason=""
status="SUCCESS"
# Should come before general errors: if FAILED==1, it indicates the test suite ran
# otherwise the pattern wouldn't have been there
elif [[ ${FAILED} -eq 1 ]]; then
summary=":cry: FAILURE"
reason="EESSI test suite produced failures."
status="FAILURE"
elif [[ ${ERROR} -eq 1 ]]; then
summary=":cry: FAILURE"
reason="EESSI test suite was not run, test step itself failed to execute."
status="FAILURE"
else
summary=":cry: FAILURE"
reason="Failed for unknown reason"
status="FAILURE"
fi


echo "[TEST]" > ${job_test_result_file}
echo -n "comment_description = " >> ${job_test_result_file}

# Use template for writing PR comment with details
# construct and write complete PR comment details: implements third alternative
comment_template="<details>__SUMMARY_FMT__<dl>__REASON_FMT____REFRAME_FMT____DETAILS_FMT__</dl></details>"
comment_success_item_fmt=":white_check_mark: __ITEM__"
comment_failure_item_fmt=":x: __ITEM__"
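# For a fully successful run, the substituted template ends up looking roughly like
# (ReFrame summary and details elided here):
# <details><summary>:grin: SUCCESS _(click triangle for details)_</summary><dl><dt>_ReFrame Summary_</dt><dd>...</dd><dt>_Details_</dt><dd>...</dd></dl></details>
# (no _Reason_ entry, since 'reason' is empty on success)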

# Initialize comment_description
comment_description=${comment_template}

# Now, start replacing template items one by one
comment_summary_fmt="<summary>__SUMMARY__ _(click triangle for details)_</summary>"
comment_summary="${comment_summary_fmt/__SUMMARY__/${summary}}"
comment_description=${comment_description/__SUMMARY_FMT__/${comment_summary}}


# Only add if there is a reason (e.g. no reason for successful runs)
if [[ ! -z ${reason} ]]; then
comment_reason_fmt="<dt>_Reason_</dt><dd>__REASONS__</dd>"
reason_details="${comment_reason_fmt/__REASONS__/${reason}}"
comment_description=${comment_description/__REASON_FMT__/${reason_details}}
else
comment_description=${comment_description/__REASON_FMT__/""}
fi

# Only add if there is a ReFrame summary (e.g. no ReFrame summary if ReFrame wasn't launched successfully)
echo "ReFrame result:"
echo "${grep_reframe_result}"
if [[ ! -z ${grep_reframe_result} ]]; then
comment_reframe_fmt="<dt>_ReFrame Summary_</dt><dd>__REFRAME_SUMMARY__</dd>"
reframe_summary=${comment_reframe_fmt/__REFRAME_SUMMARY__/${grep_reframe_result}}
comment_description=${comment_description/__REFRAME_FMT__/${reframe_summary}}
else
comment_description=${comment_description/__REFRAME_FMT__/""}
fi

# Declare functions
function print_br_item() {
format="${1}"
item="${2}"
echo -n "${format//__ITEM__/${item}}<br/>"
}
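# Example usage: print_br_item ':x: __ITEM__' 'some item'
# prints ':x: some item<br/>'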

function success() {
format="${comment_success_item_fmt}"
item="$1"
print_br_item "${format}" "${item}"
}

function failure() {
format="${comment_failure_item_fmt}"
item="$1"
print_br_item "${format}" "${item}"
}

function add_detail() {
actual=${1}
expected=${2}
success_msg="${3}"
failure_msg="${4}"
if [[ ${actual} -eq ${expected} ]]; then
success "${success_msg}"
else
failure "${failure_msg}"
fi
}
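# Example usage: add_detail "${SLURM_OUTPUT_FOUND}" 1 "output found" "output missing"
# prints the success item if SLURM_OUTPUT_FOUND equals 1, the failure item otherwise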

# first construct the list of detail items (comment_details_list),
# then use it to set comment_details
comment_details_list=""

success_msg="job output file <code>${job_out}</code>"
failure_msg="no job output file <code>${job_out}</code>"
comment_details_list=${comment_details_list}$(add_detail ${SLURM_OUTPUT_FOUND} 1 "${success_msg}" "${failure_msg}")

success_msg="no message matching <code>${GP_error}</code>"
failure_msg="found message matching <code>${GP_error}</code>"
comment_details_list=${comment_details_list}$(add_detail ${ERROR} 0 "${success_msg}" "${failure_msg}")

# Add an escape character to every *, so it is rendered correctly in the comment on GitHub
GP_failed="${GP_failed//\*/\\*}"
success_msg="no message matching <code>""${GP_failed}""</code>"
failure_msg="found message matching <code>""${GP_failed}""</code>"
comment_details_list=${comment_details_list}$(add_detail ${FAILED} 0 "${success_msg}" "${failure_msg}")
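# Illustrative (hypothetical) value of comment_details_list for a clean run, assuming
# SLURM_JOB_ID=123456; in reality this is one long line:
#   :white_check_mark: job output file <code>slurm-123456.out</code><br/>
#   :white_check_mark: no message matching <code>ERROR: </code><br/>
#   :white_check_mark: no message matching <code>\[\s\*FAILED\s\*\].\*Ran .\* test case</code><br/>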

comment_details_fmt="<dt>_Details_</dt><dd>__DETAILS_LIST__</dd>"
comment_details="${comment_details_fmt/__DETAILS_LIST__/${comment_details_list}}"
comment_description=${comment_description/__DETAILS_FMT__/${comment_details}}

# Write the comment description to the result file
echo "${comment_description}" >> ${job_test_result_file}
echo "status = ${status}" >> ${job_test_result_file}

exit 0
143 changes: 143 additions & 0 deletions bot/test.sh
@@ -0,0 +1,143 @@
#!/usr/bin/env bash
#
# Script to run tests for the whole EESSI compatibility layer.
# Intended use is that it is called at the end of a (batch) job running on a compute node.
#
# This script is part of the EESSI compatibility layer, see
# https://github.com/EESSI/compatibility-layer.git
#
# author: Thomas Roeblitz (@trz42)
# author: Caspar van Leeuwen (@casparvl)
# author: Bob Dröge (@bedroge)
#
# license: GPLv2
#

# ASSUMPTIONs:
# + assumptions for the build step (as run through bot/build.sh, which is provided
# in this repository too)
# - working directory has been prepared by the bot with a checkout of a
# pull request (OR by some other means)
# - the working directory contains a directory 'cfg' where the main config
# file 'job.cfg' has been deposited
# - the directory may contain any additional files referenced in job.cfg
# + assumptions for the test step
# - temporary storage is still available
# example
# Using /tmp/bot/EESSI/eessi.7l3zm2x7qH as temporary storage...
# - run test/compat_layer.py with ReFrame inside build container using tmp storage from build step
# plus possibly additional settings (repo, etc.)

# stop as soon as something fails
set -e

# source utils.sh and cfg_files.sh
source scripts/utils.sh
source scripts/cfg_files.sh

# defaults
export JOB_CFG_FILE="${JOB_CFG_FILE_OVERRIDE:=./cfg/job.cfg}"
HOST_ARCH=$(uname -m)
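# e.g. to point this script at a different config file (hypothetical path):
#   JOB_CFG_FILE_OVERRIDE=/path/to/job.cfg ./bot/test.sh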

# check if ${JOB_CFG_FILE} exists
if [[ ! -r "${JOB_CFG_FILE}" ]]; then
fatal_error "job config file (JOB_CFG_FILE=${JOB_CFG_FILE}) does not exist or not readable"
fi
echo "bot/test.sh: showing ${JOB_CFG_FILE} from software-layer side"
cat ${JOB_CFG_FILE}

echo "bot/test.sh: obtaining configuration settings from '${JOB_CFG_FILE}'"
cfg_load ${JOB_CFG_FILE}

# if http_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $http_proxy
HTTP_PROXY=$(cfg_get_value "site_config" "http_proxy")
HTTP_PROXY=${HTTP_PROXY:-${http_proxy}}
echo "bot/test.sh: HTTP_PROXY='${HTTP_PROXY}'"

# if https_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $https_proxy
HTTPS_PROXY=$(cfg_get_value "site_config" "https_proxy")
HTTPS_PROXY=${HTTPS_PROXY:-${https_proxy}}
echo "bot/test.sh: HTTPS_PROXY='${HTTPS_PROXY}'"

LOCAL_TMP=$(cfg_get_value "site_config" "local_tmp")
echo "bot/test.sh: LOCAL_TMP='${LOCAL_TMP}'"

# try to determine tmp directory from build job
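# The build step logs a line like (path hypothetical, cf. the example under ASSUMPTIONs):
#   To resume work add '--resume /tmp/bot/EESSI/eessi.7l3zm2x7qH'
# \K drops everything before it from the match, so only the path is captured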
EESSI_TMPDIR=$(grep -oP "To resume work add '--resume \K.*(?=')" slurm-${SLURM_JOBID}.out)

if [[ -z ${EESSI_TMPDIR} ]]; then
echo "bot/test.sh: no information about tmp directory build step; --> giving up"
exit 2
fi

# obtain list of modules to be loaded
LOAD_MODULES=$(cfg_get_value "site_config" "load_modules")
echo "bot/test.sh: LOAD_MODULES='${LOAD_MODULES}'"

# load modules if LOAD_MODULES is not empty
if [[ ! -z ${LOAD_MODULES} ]]; then
for mod in $(echo ${LOAD_MODULES} | tr ',' '\n')
do
echo "bot/test.sh: loading module '${mod}'"
module load ${mod}
done
else
echo "bot/test.sh: no modules to be loaded"
fi

cpu_target_arch=$(cfg_get_value "architecture" "software_subdir" | cut -d/ -f1)
host_arch=$(uname -m)
eessi_arch=${cpu_target_arch:-${host_arch}}
eessi_os=linux
job_version=$(cfg_get_value "repository" "repo_version")
eessi_version=${job_version:-2023.06}
job_repo=$(cfg_get_value "repository" "repo_name")
eessi_repo=${job_repo:-software.eessi.io}
tar_topdir=/cvmfs/${eessi_repo}/versions

if [ "${eessi_arch}" != "${host_arch}" ]; then
echo "Requested architecture (${eessi_arch}) is different from this machine's architecture ($(uname -m))!"
exit 1
fi

RUNTIME=$(get_container_runtime)
exit_code=$?
[[ ${VERBOSE} == '-vvv' ]] && echo "RUNTIME='${RUNTIME}'"
check_exit_code ${exit_code} "using runtime ${RUNTIME}" "oh no, neither apptainer nor singularity available"

# Set up paths and mount points for Apptainer
if [[ -z ${APPTAINER_CACHEDIR} ]]; then
export APPTAINER_CACHEDIR=${EESSI_TMPDIR}/apptainer_cache
[[ ${VERBOSE} == '-vvv' ]] && echo "APPTAINER_CACHEDIR='${APPTAINER_CACHEDIR}'"
fi
export APPTAINER_BIND="${EESSI_TMPDIR}/cvmfs:/cvmfs,${PWD}:/compatibility-layer"
export APPTAINER_BIND="${APPTAINER_BIND},${EESSI_TMPDIR}/tmp:/tmp"
[[ ${VERBOSE} == '-vvv' ]] && echo "APPTAINER_BIND='${APPTAINER_BIND}'"
export APPTAINER_HOME="${EESSI_TMPDIR}/home:/home/${USER}"
[[ ${VERBOSE} == '-vvv' ]] && echo "APPTAINER_HOME='${APPTAINER_HOME}'"

# also define SINGULARITY_* env vars
if [[ -z ${SINGULARITY_CACHEDIR} ]]; then
export SINGULARITY_CACHEDIR=${EESSI_TMPDIR}/apptainer_cache
[[ ${VERBOSE} == '-vvv' ]] && echo "SINGULARITY_CACHEDIR='${SINGULARITY_CACHEDIR}'"
fi
export SINGULARITY_BIND="${EESSI_TMPDIR}/cvmfs:/cvmfs,${PWD}:/compatibility-layer"
export SINGULARITY_BIND="${SINGULARITY_BIND},${EESSI_TMPDIR}/tmp:/tmp"
[[ ${VERBOSE} == '-vvv' ]] && echo "SINGULARITY_BIND='${SINGULARITY_BIND}'"
export SINGULARITY_HOME="${EESSI_TMPDIR}/home:/home/${USER}"
[[ ${VERBOSE} == '-vvv' ]] && echo "SINGULARITY_HOME='${SINGULARITY_HOME}'"

CONTAINER=docker://ghcr.io/eessi/bootstrap-prefix:debian11

${RUNTIME} shell ${CONTAINER} <<EOF
pip3 install --ignore-installed --prefix=/tmp/reframe reframe-hpc
export PYTHONPATH=/tmp/reframe/lib/python3.9/site-packages
export EESSI_REPO_DIR="/cvmfs/${eessi_repo}"
export EESSI_VERSION=${eessi_version}
export EESSI_ARCH=${host_arch}
export EESSI_OS=linux
export RFM_PREFIX=/compatibility-layer/reframe_runs
/tmp/reframe/bin/reframe --nocolor -r -v -c /compatibility-layer/test/compat_layer.py
EOF

exit 0
