Skip to content

Commit

Permalink
Merge pull request #488 from bedroge/software_rebuilds
Browse files Browse the repository at this point in the history
Add functionality for rebuilding software: try it on OpenMPI 4.1.x to fix `smcuda` issue
  • Loading branch information
casparvl authored Mar 26, 2024
2 parents 73905db + bde75ee commit f349fde
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 41 deletions.
19 changes: 10 additions & 9 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -207,26 +207,27 @@ changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z
if [ -z ${changed_easystacks} ]; then
echo "No missing installations, party time!" # Ensure the bot report success, as there was nothing to be build here
else

for easystack_file in ${changed_easystacks}; do

echo -e "Processing easystack file ${easystack_file}...\n\n"

# determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
eb_version=$(echo ${easystack_file} | sed 's/.*eb-\([0-9.]*\).*/\1/g')

# load EasyBuild module (will be installed if it's not available yet)
source ${TOPDIR}/load_easybuild_module.sh ${eb_version}

${EB} --show-config

echo_green "All set, let's start installing some software with EasyBuild v${eb_version} in ${EASYBUILD_INSTALLPATH}..."

if [ -f ${easystack_file} ]; then
echo_green "Feeding easystack file ${easystack_file} to EasyBuild..."

${EB} --easystack ${TOPDIR}/${easystack_file} --robot
ec=$?

# copy EasyBuild log file if EasyBuild exited with an error
if [ ${ec} -ne 0 ]; then
eb_last_log=$(unset EB_VERBOSE; eb --last-log)
Expand All @@ -241,7 +242,7 @@ else
else
fatal_error "Easystack file ${easystack_file} not found!"
fi

done
fi

Expand Down
125 changes: 125 additions & 0 deletions EESSI-remove-software.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/bin/bash
#
# Script to remove part of the EESSI software stack (version set through init/eessi_defaults)

# see example parsing of command line arguments at
# https://wiki.bash-hackers.org/scripting/posparams#using_a_while_loop
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash

display_help() {
    # Write the usage summary to stdout: one header line followed by one
    # line per supported command line option.
    printf '%s\n' \
        "usage: $0 [OPTIONS]" \
        " -g | --generic - instructs script to build for generic architecture target" \
        " -h | --help - display this usage information"
}

POSITIONAL_ARGS=()

# Classify every command line argument in order. None of the supported
# options takes a value, so each argument can be handled on its own.
for cli_arg in "$@"; do
    if [[ ${cli_arg} == "-g" || ${cli_arg} == "--generic" ]]; then
        DETECTION_PARAMETERS="--generic"
    elif [[ ${cli_arg} == "-h" || ${cli_arg} == "--help" ]]; then
        # print usage and stop right away, remaining arguments are ignored
        display_help
        exit 0
    elif [[ ${cli_arg} == -* ]]; then
        # anything else that looks like an option is rejected
        echo "Error: Unknown option: ${cli_arg}" >&2
        exit 1
    else
        # not an option: remember it as positional argument
        POSITIONAL_ARGS+=("${cli_arg}")
    fi
done

# the collected non-option arguments become the new positional parameters
set -- "${POSITIONAL_ARGS[@]}"

# directory that holds this script (after resolving symlinks); the files
# sourced and executed below are looked up relative to it
TOPDIR=$(dirname "$(realpath "$0")")

# private temporary directory for this run; assignment and export are kept
# separate so that a failing mktemp is not hidden behind the exit status of
# 'export' (scripts/utils.sh is not sourced yet, hence the plain echo + exit)
if ! TMPDIR=$(mktemp -d /tmp/eessi-remove.XXXXXXXX); then
    echo "ERROR: failed to create temporary directory /tmp/eessi-remove.XXXXXXXX" >&2
    exit 1
fi
export TMPDIR

source "${TOPDIR}/scripts/utils.sh"

echo ">> Determining software subdirectory to use for current build host..."
if [[ -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]]; then
    # $DETECTION_PARAMETERS is left unquoted on purpose: it is either unset/empty
    # (no extra argument must be passed) or the single word "--generic"
    EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 "${TOPDIR}/eessi_software_subdir.py" $DETECTION_PARAMETERS)
    export EESSI_SOFTWARE_SUBDIR_OVERRIDE
    echo ">> Determined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE via 'eessi_software_subdir.py $DETECTION_PARAMETERS' script"
else
    echo ">> Picking up pre-defined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE: ${EESSI_SOFTWARE_SUBDIR_OVERRIDE}"
fi

echo ">> Setting up environment..."

source "${TOPDIR}/init/bash"

# [[ ]] is used so that an empty or unset $EESSI_CVMFS_REPO is reported as
# unavailable: an unquoted '[ -d ]' without operand is always true
if [[ -d ${EESSI_CVMFS_REPO} ]]; then
    echo_green "$EESSI_CVMFS_REPO available, OK!"
else
    fatal_error "$EESSI_CVMFS_REPO is not available!"
fi

# the software subdirectory in use must be set, and must be the one that
# was requested through $EESSI_SOFTWARE_SUBDIR_OVERRIDE
if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then
    fatal_error "Failed to determine software subdirectory?!"
elif [[ "${EESSI_SOFTWARE_SUBDIR}" != "${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" ]]; then
    fatal_error "Values for EESSI_SOFTWARE_SUBDIR_OVERRIDE (${EESSI_SOFTWARE_SUBDIR_OVERRIDE}) and EESSI_SOFTWARE_SUBDIR (${EESSI_SOFTWARE_SUBDIR}) differ!"
else
    echo_green ">> Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory!"
fi

# Configure EasyBuild and point $MODULEPATH exclusively at the modules of the
# installation that is being modified.
echo ">> Configuring EasyBuild..."
# $EB is set before configure_easybuild is sourced
# NOTE(review): presumably configure_easybuild reads $EB and sets
# $EASYBUILD_INSTALLPATH (used below) - confirm against that file
EB="eb"
source $TOPDIR/configure_easybuild

echo ">> Setting up \$MODULEPATH..."
# make sure no modules are loaded
module --force purge
# ignore current $MODULEPATH entirely
module unuse $MODULEPATH
# only make the modules below $EASYBUILD_INSTALLPATH available
module use $EASYBUILD_INSTALLPATH/modules/all
# sanity check: $MODULEPATH must not be empty after the 'module use' above
if [[ -z ${MODULEPATH} ]]; then
fatal_error "Failed to set up \$MODULEPATH?!"
else
echo_green ">> MODULEPATH set up: ${MODULEPATH}"
fi

#######################################
# List the easystack files located in a rebuilds/ subdirectory that are
# touched by a PR diff.
# Arguments: $1 - path to the PR diff file
# Outputs:   matching easystack paths (without the leading 'a/' or 'b/'),
#            one per line; nothing if there are none
#######################################
list_rebuild_easystacks() {
    local diff_file=$1
    # '+++ <prefix>/<path>' lines name the new version of every changed file
    grep '^+++' "${diff_file}" \
        | cut -f2 -d' ' \
        | sed 's@^[a-z]/@@g' \
        | grep '^easystacks/.*yml$' \
        | grep -E -v 'known-issues|missing' \
        | grep "/rebuilds/"
}

#######################################
# Extract the names of the modules that are marked for rebuild ([R]) from
# the output of 'eb --dry-run-short', i.e. from lines like:
#  * [R] $CFGS/s/someapp/someapp-someversion.eb (module: someapp/someversion)
# Inputs:    dry-run output on stdin
# Outputs:   module names ("someapp/someversion"), one per line
#######################################
extract_rebuild_modules() {
    grep "^ \* \[R\]" | grep -o "module: .*[^)]" | awk '{print $2}'
}

# assume there's only one diff file that corresponds to the PR patch file;
# take the first match of the glob rather than parsing the output of 'ls'
pr_diff=""
for diff_candidate in [0-9]*.diff; do
    if [[ -f ${diff_candidate} ]]; then
        pr_diff=${diff_candidate}
        break
    fi
done

# if this script is run as root, use PR patch file to determine if software needs to be removed first
if [[ ${EUID} -ne 0 ]]; then
    fatal_error "This script can only be run by root!"
elif [[ -z ${pr_diff} ]]; then
    # without a diff file there is no way to tell what must be removed
    fatal_error "No PR diff file (matching [0-9]*.diff) found in ${PWD}!"
else
    changed_easystacks_rebuilds=$(list_rebuild_easystacks "${pr_diff}")
    if [[ -z ${changed_easystacks_rebuilds} ]]; then
        echo "No software needs to be removed."
    else
        # word splitting is intended here: the list holds one path per line
        for easystack_file in ${changed_easystacks_rebuilds}; do
            # determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
            eb_version=$(echo "${easystack_file}" | sed 's/.*eb-\([0-9.]*\).*/\1/g')

            # load EasyBuild module (will be installed if it's not available yet)
            source "${TOPDIR}/load_easybuild_module.sh" "${eb_version}"

            if [[ -f ${easystack_file} ]]; then
                echo_green "Software rebuild(s) requested in ${easystack_file}, so determining which existing installation have to be removed..."
                # we need to remove existing installation directories first,
                # so let's figure out which modules have to be rebuilt by doing a dry-run
                # and picking up "someapp/someversion" from the relevant lines (with [R])
                rebuild_apps=$(eb --allow-use-as-root-and-accept-consequences --dry-run-short --rebuild --easystack "${easystack_file}" | extract_rebuild_modules)
                for app in ${rebuild_apps}; do
                    # ':?' aborts the script rather than removing paths directly
                    # below '/' in case $EASYBUILD_INSTALLPATH is empty or unset
                    app_dir=${EASYBUILD_INSTALLPATH:?}/software/${app}
                    app_module=${EASYBUILD_INSTALLPATH:?}/modules/all/${app}.lua
                    echo_yellow "Removing ${app_dir} and ${app_module}..."
                    rm -rf -- "${app_dir}"
                    rm -rf -- "${app_module}"
                done
            else
                fatal_error "Easystack file ${easystack_file} not found!"
            fi
        done
    fi
fi
57 changes: 46 additions & 11 deletions bot/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,56 @@ COMMON_ARGS+=("--mode" "run")
# make sure to use the same parent dir for storing tarballs of tmp
PREVIOUS_TMP_DIR=${PWD}/previous_tmp

# prepare arguments to eessi_container.sh (specific to build step), and the
# arguments for the install/removal scripts that are run inside the container;
# BUILD_STEP_ARGS is declared here (and only here) because the removal step
# below may already add a --resume option to it
declare -a BUILD_STEP_ARGS=()
declare -a INSTALL_SCRIPT_ARGS=()
declare -a REMOVAL_SCRIPT_ARGS=()
if [[ ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} =~ .*/generic$ ]]; then
    INSTALL_SCRIPT_ARGS+=("--generic")
    REMOVAL_SCRIPT_ARGS+=("--generic")
fi
[[ -n ${BUILD_LOGS_DIR} ]] && INSTALL_SCRIPT_ARGS+=("--build-logs-dir" "${BUILD_LOGS_DIR}")
[[ -n ${SHARED_FS_PATH} ]] && INSTALL_SCRIPT_ARGS+=("--shared-fs-path" "${SHARED_FS_PATH}")

# determine if the removal step has to be run
# assume there's only one diff file that corresponds to the PR patch file;
# take the first match of the glob rather than parsing the output of 'ls'
pr_diff=""
for diff_candidate in [0-9]*.diff; do
    if [[ -f ${diff_candidate} ]]; then
        pr_diff=${diff_candidate}
        break
    fi
done
# without a diff file there is nothing to inspect (and no command may be left
# reading from stdin because its file argument expanded to nothing)
changed_easystacks_rebuilds=""
if [[ -n ${pr_diff} ]]; then
    changed_easystacks_rebuilds=$(grep '^+++' "${pr_diff}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | grep "/rebuilds/")
fi
if [[ -z ${changed_easystacks_rebuilds} ]]; then
    echo "This PR does not add any easystack files in a rebuilds subdirectory, so let's skip the removal step."
else
    # prepare directory to store tarball of tmp for removal step
    TARBALL_TMP_REMOVAL_STEP_DIR=${PREVIOUS_TMP_DIR}/removal_step
    mkdir -p "${TARBALL_TMP_REMOVAL_STEP_DIR}"

    # prepare arguments to eessi_container.sh specific to remove step
    declare -a REMOVAL_STEP_ARGS=()
    # save into the directory of the removal step (the directory for the
    # build step is only defined further down)
    REMOVAL_STEP_ARGS+=("--save" "${TARBALL_TMP_REMOVAL_STEP_DIR}")
    REMOVAL_STEP_ARGS+=("--storage" "${STORAGE}")
    # add fakeroot option in order to be able to remove software, see:
    # https://github.com/EESSI/software-layer/issues/312
    REMOVAL_STEP_ARGS+=("--fakeroot")

    # create tmp file for output of removal step
    removal_outerr=$(mktemp remove.outerr.XXXX)

    echo "Executing command to remove software:"
    echo "./eessi_container.sh ${COMMON_ARGS[@]} ${REMOVAL_STEP_ARGS[@]}"
    echo " -- ./EESSI-remove-software.sh \"${REMOVAL_SCRIPT_ARGS[@]}\" \"$@\" 2>&1 | tee -a ${removal_outerr}"
    ./eessi_container.sh "${COMMON_ARGS[@]}" "${REMOVAL_STEP_ARGS[@]}" \
        -- ./EESSI-remove-software.sh "${REMOVAL_SCRIPT_ARGS[@]}" "$@" 2>&1 | tee -a "${removal_outerr}"

    # make sure that the build step resumes from the same temporary directory
    # this is important, as otherwise the removed software will still be there
    REMOVAL_TMPDIR=$(grep ' as tmp directory ' "${removal_outerr}" | cut -d ' ' -f 2)
    BUILD_STEP_ARGS+=("--resume" "${REMOVAL_TMPDIR}")
fi

# prepare directory to store tarball of tmp for build step
TARBALL_TMP_BUILD_STEP_DIR=${PREVIOUS_TMP_DIR}/build_step
mkdir -p "${TARBALL_TMP_BUILD_STEP_DIR}"

# prepare arguments to eessi_container.sh specific to build step; only append
# here, so a --resume option added by the removal step is kept
BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
# add options required to handle NVIDIA support
Expand All @@ -182,14 +226,6 @@ if [[ ! -z ${SHARED_FS_PATH} ]]; then
BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections")
fi

# INSTALL_SCRIPT_ARGS (arguments for the install script) is declared and
# populated once, before the removal step; it is not rebuilt here

# create tmp file (in the current directory) for output of build step
build_outerr=$(mktemp build.outerr.XXXX)

Expand All @@ -211,8 +247,7 @@ declare -a TARBALL_STEP_ARGS=()
TARBALL_STEP_ARGS+=("--save" "${TARBALL_TMP_TARBALL_STEP_DIR}")

# determine temporary directory to resume from (exactly one --resume option)
if [[ -n ${REMOVAL_TMPDIR} ]]; then
    # a removal step was run and the build step was resumed from its
    # temporary directory, so the tarball step continues from there too
    TARBALL_STEP_ARGS+=("--resume" "${REMOVAL_TMPDIR}")
else
    # no removal step was run: pick up the temporary directory that was
    # reported in the output of the build step
    BUILD_TMPDIR=$(grep ' as tmp directory ' "${build_outerr}" | cut -d ' ' -f 2)
    TARBALL_STEP_ARGS+=("--resume" "${BUILD_TMPDIR}")
fi

timestamp=$(date +%s)
# to set EESSI_VERSION we need to source init/eessi_defaults now
Expand Down
21 changes: 0 additions & 21 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,31 +84,10 @@
end
end
local function eessi_openmpi_load_hook(t)
    -- disable smcuda BTL when loading OpenMPI module for aarch64/neoverse_v1,
    -- to work around hang/crash due to bug in OpenMPI;
    -- see https://gitlab.com/eessi/support/-/issues/41
    --
    -- t.modFullName is the only field of t that is used here
    -- module name is the part of the full module name in front of the first "/"
    local moduleName = string.match(t.modFullName, "(.-)/")
    -- fall back to an empty string, so the comparison below is simply false
    -- when $EESSI_SOFTWARE_SUBDIR is not set
    local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
    if (moduleName == "OpenMPI") and (cpuTarget == "aarch64/neoverse_v1") then
        local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
        LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
        -- append to an existing value of $OMPI_MCA_btl rather than replacing it
        local ompiMcaBtl = os.getenv("OMPI_MCA_btl")
        if ompiMcaBtl == nil then
            setenv("OMPI_MCA_btl", "^smcuda")
        else
            setenv("OMPI_MCA_btl", ompiMcaBtl .. ",^smcuda")
        end
    end
end
-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
-- The argument t is passed on unchanged to each of the hooks, which are called in the order listed
function eessi_load_hook(t)
eessi_cuda_enabled_load_hook(t)
eessi_openmpi_load_hook(t)
end
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# 2024-03-01
# Rebuild all OpenMPI 4.1.x versions due to an issue with smcuda:
# https://github.com/open-mpi/ompi/issues/12270
# https://github.com/open-mpi/ompi/pull/12344
# https://github.com/easybuilders/easybuild-easyconfigs/pull/19940
easyconfigs:
  # every entry uses the full easyconfig file name (including .eb) and takes
  # the updated easyconfig from the easyconfigs PR listed above
  - OpenMPI-4.1.4-GCC-12.2.0.eb:
      options:
        from-pr: 19940
  - OpenMPI-4.1.5-GCC-12.3.0.eb:
      options:
        from-pr: 19940
  - OpenMPI-4.1.6-GCC-13.2.0.eb:
      options:
        from-pr: 19940
11 changes: 11 additions & 0 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ display_help() {
echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
echo " -c | --container IMG - image file or URL defining the container to use"
echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
echo " -f | --fakeroot - run the container with --fakeroot [default: false]"
echo " -g | --storage DIR - directory space on host machine (used for"
echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
echo " -h | --help - display this usage information [default: false]"
Expand Down Expand Up @@ -113,6 +114,7 @@ display_help() {
ACCESS="ro"
CONTAINER="docker://ghcr.io/eessi/build-node:debian11"
#DRY_RUN=0
FAKEROOT=0
VERBOSE=0
STORAGE=
LIST_REPOS=0
Expand Down Expand Up @@ -140,6 +142,10 @@ while [[ $# -gt 0 ]]; do
# DRY_RUN=1
# shift 1
# ;;
-f|--fakeroot)
FAKEROOT=1
shift 1
;;
-g|--storage)
STORAGE="$2"
shift 2
Expand Down Expand Up @@ -466,6 +472,11 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
fi
fi

# Pass --fakeroot on to the container when it was requested (-f|--fakeroot)
if (( FAKEROOT == 1 )); then
    ADDITIONAL_CONTAINER_OPTIONS+=("--fakeroot")
fi

# set up repository config (always create directory repos_cfg and populate it with info when
# arg -r|--repository is used)
mkdir -p ${EESSI_TMPDIR}/repos_cfg
Expand Down

0 comments on commit f349fde

Please sign in to comment.