diff --git a/.github/workflows/tests_archdetect.yml b/.github/workflows/tests_archdetect.yml index 618f6eb142..37338693c5 100644 --- a/.github/workflows/tests_archdetect.yml +++ b/.github/workflows/tests_archdetect.yml @@ -14,14 +14,15 @@ jobs: - x86_64/amd/zen2/Azure-CentOS7-7V12 - x86_64/amd/zen3/Azure-CentOS7-7V73X - ppc64le/power9le/unknown-power9le - - aarch64/arm/neoverse-n1/Azure-Ubuntu20-Altra - - aarch64/arm/neoverse-n1/AWS-awslinux-graviton2 - - aarch64/arm/neoverse-v1/AWS-awslinux-graviton3 + - aarch64/neoverse_n1/Azure-Ubuntu20-Altra + - aarch64/neoverse_n1/AWS-awslinux-graviton2 + - aarch64/neoverse_v1/AWS-awslinux-graviton3 fail-fast: false steps: - name: checkout uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 - + - name: Enable EESSI + uses: eessi/github-action-eessi@58b50fd2eead2162c2b9ac258d4fb60cc9f30503 # v2.0.13 - name: test eessi_archdetect.sh run: | export EESSI_MACHINE_TYPE=${{matrix.proc_cpuinfo}} @@ -34,3 +35,15 @@ jobs: echo "Test for ${{matrix.proc_cpuinfo}} FAILED: $CPU_ARCH" >&2 exit 1 fi + CPU_ARCHES=$(./init/eessi_archdetect.sh -a cpupath) + if [[ $CPU_ARCHES == "$( cat ./tests/archdetect/${{matrix.proc_cpuinfo}}.all.output )" ]]; then + echo "Test for ${{matrix.proc_cpuinfo}} PASSED: $CPU_ARCHES" >&2 + else + echo "Test for ${{matrix.proc_cpuinfo}} FAILED: $CPU_ARCHES" >&2 + exit 1 + fi + # Check all those architectures actually exist + for dir in $(echo "$CPU_ARCHES" | tr ':' '\n'); do + # Search all EESSI versions as we may drop support at some point + ls -d "$EESSI_PREFIX"/../*/software/linux/"$dir" + done diff --git a/configure_easybuild b/configure_easybuild index 7dca1ce682..c67b879cf3 100644 --- a/configure_easybuild +++ b/configure_easybuild @@ -25,8 +25,7 @@ fi # note: filtering Bison may break some installations, like Qt5 (see https://github.com/EESSI/software-layer/issues/49) # filtering pkg-config breaks R-bundle-Bioconductor installation (see also https://github.com/easybuilders/easybuild-easyconfigs/pull/11104) -# problems occur when filtering pkg-config with gnuplot too (picks up Lua 5.1 from $EPREFIX rather than from Lua 5.3 dependency) -DEPS_TO_FILTER=Autoconf,Automake,Autotools,binutils,bzip2,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,Lua,M4,makeinfo,ncurses,util-linux,XZ,zlib +DEPS_TO_FILTER=Autoconf,Automake,Autotools,binutils,bzip2,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,M4,makeinfo,ncurses,util-linux,XZ,zlib # For aarch64 we need to also filter out Yasm. # See https://github.com/easybuilders/easybuild-easyconfigs/issues/11190 if [[ "$EESSI_CPU_FAMILY" == "aarch64" ]]; then diff --git a/eb_hooks.py b/eb_hooks.py index a1702ab832..aa02a9b0db 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -19,6 +19,7 @@ CPU_TARGET_NEOVERSE_V1 = 'aarch64/neoverse_v1' +CPU_TARGET_AARCH64_GENERIC = 'aarch64/generic' EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs' @@ -276,6 +277,24 @@ def pre_configure_hook_wrf_aarch64(self, *args, **kwargs): raise EasyBuildError("WRF-specific hook triggered for non-WRF easyconfig?!") +def pre_configure_hook_LAMMPS_aarch64(self, *args, **kwargs): + """ + pre-configure hook for LAMMPS: + - set kokkos_arch on Aarch64 + """ + + cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') + if self.name == 'LAMMPS': + if self.version == '23Jun2022': + if get_cpu_architecture() == AARCH64: + if cpu_target == CPU_TARGET_AARCH64_GENERIC: + self.cfg['kokkos_arch'] = 'ARM80' + else: + self.cfg['kokkos_arch'] = 'ARM81' + else: + raise EasyBuildError("LAMMPS-specific hook triggered for non-LAMMPS easyconfig?!") + + def pre_test_hook(self,*args, **kwargs): """Main pre-test hook: trigger custom functions based on software name.""" if self.name in PRE_TEST_HOOKS: @@ -428,6 +447,7 @@ def inject_gpu_property(ec): 'MetaBAT': pre_configure_hook_metabat_filtered_zlib_dep, 'OpenBLAS': pre_configure_hook_openblas_optarch_generic, 'WRF': pre_configure_hook_wrf_aarch64, + 'LAMMPS': pre_configure_hook_LAMMPS_aarch64, } PRE_TEST_HOOKS = { diff --git a/eessi-2023.06-eb-4.8.0-2021b.yml b/eessi-2023.06-eb-4.8.0-2021b.yml index 477ba6320c..a2562a4ed1 100644 --- a/eessi-2023.06-eb-4.8.0-2021b.yml +++ b/eessi-2023.06-eb-4.8.0-2021b.yml @@ -4,4 +4,9 @@ easyconfigs: # see https://github.com/easybuilders/easybuild-easyconfigs/pull/18746 options: from-pr: 18746 + - gnuplot-5.4.2-GCCcore-11.2.0.eb: + # make sure that Lua dependency is correctly picked up, + # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19261 + options: + from-pr: 19261 - OpenFOAM-v2112-foss-2021b.eb diff --git a/eessi-2023.06-eb-4.8.1-2021b.yml b/eessi-2023.06-eb-4.8.1-2021b.yml index 62f529563a..6ee44926ba 100644 --- a/eessi-2023.06-eb-4.8.1-2021b.yml +++ b/eessi-2023.06-eb-4.8.1-2021b.yml @@ -5,3 +5,11 @@ easyconfigs: options: from-pr: 18834 - R-4.2.0-foss-2021b.eb + - PLUMED-2.7.3-foss-2021b.eb: + # the --enable-asmjit is not supported on Aarch64 + options: + from-pr: 19110 + - LAMMPS-23Jun2022-foss-2021b-kokkos.eb: + # TBB and ScaFaCos are optional dependencies when building on Intel arch + options: + from-pr: 19246 diff --git a/eessi-2023.06-eb-4.8.2-2021b.yml b/eessi-2023.06-eb-4.8.2-2021b.yml new file mode 100644 index 0000000000..00c02adf91 --- /dev/null +++ b/eessi-2023.06-eb-4.8.2-2021b.yml @@ -0,0 +1,6 @@ +easyconfigs: + - Pillow-8.3.2-GCCcore-11.2.0.eb: + # avoid that hardcoded paths like /usr/include are used in build commands + options: + from-pr: 19226 + - matplotlib-3.4.3-foss-2021b.eb diff --git a/eessi-2023.06-eb-4.8.2-2022a.yml b/eessi-2023.06-eb-4.8.2-2022a.yml index bab796db2b..ccd3fcd65d 100644 --- a/eessi-2023.06-eb-4.8.2-2022a.yml +++ b/eessi-2023.06-eb-4.8.2-2022a.yml @@ -5,27 +5,6 @@ easyconfigs: - AOFlagger-3.4.0-foss-2022a: options: from-pr: 19119 - # Exclude Lua from `filter-deps` as the compat layer version is too old for the software - filter-deps: - - Autoconf - - Automake - - Autotools - - binutils - - bzip2 - - DBus - - flex - - gettext - - gperf - - help2man - - intltool - - libreadline - - libtool - - ncurses - - M4 - - makeinfo - - util-linux - - XZ - - zlib - EveryBeam-0.5.2-foss-2022a: options: from-pr: 19119 @@ -35,3 +14,8 @@ easyconfigs: - WSClean-3.4-foss-2022a: options: from-pr: 19119 + - Pillow-9.1.1-GCCcore-11.3.0.eb: + # avoid that hardcoded paths like /usr/include are used in build commands + options: + from-pr: 19226 + - matplotlib-3.5.2-foss-2022a.eb diff --git a/eessi_container.sh b/eessi_container.sh index 48c4653ba9..e31808d546 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -30,8 +30,8 @@ # -. initial settings & exit codes TOPDIR=$(dirname $(realpath $0)) -source ${TOPDIR}/scripts/utils.sh -source ${TOPDIR}/scripts/cfg_files.sh +source "${TOPDIR}"/scripts/utils.sh +source "${TOPDIR}"/scripts/cfg_files.sh # exit codes: bitwise shift codes to allow for combination of exit codes # ANY_ERROR_EXITCODE is sourced from ${TOPDIR}/scripts/utils.sh @@ -46,6 +46,7 @@ SAVE_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 8)) HTTP_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 9)) HTTPS_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 10)) RUN_SCRIPT_MISSING_EXITCODE=$((${ANY_ERROR_EXITCODE} << 11)) +NVIDIA_MODE_UNKNOWN_EXITCODE=$((${ANY_ERROR_EXITCODE} << 12)) # CernVM-FS settings CVMFS_VAR_LIB="var-lib-cvmfs" @@ -72,12 +73,17 @@ display_help() { echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]" echo " -c | --container IMG - image file or URL defining the container to use" echo " [default: docker://ghcr.io/eessi/build-node:debian11]" - echo " -h | --help - display this usage information [default: false]" echo " -g | --storage DIR - directory space on host machine (used for" echo " temporary data) [default: 1. TMPDIR, 2. /tmp]" + echo " -h | --help - display this usage information [default: false]" + echo " -i | --host-injections - directory to link to for host_injections " + echo " [default: /..storage../opt-eessi]" echo " -l | --list-repos - list available repository identifiers [default: false]" echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or" echo " MODE==run (run a script or command) [default: shell]" + echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs," + echo " MODE==install for a CUDA installation, MODE==run to" + echo " attach a GPU, MODE==all for both [default: false]" echo " -r | --repository CFG - configuration file or identifier defining the" echo " repository to use [default: EESSI-pilot via" echo " default container, see --container]" @@ -111,6 +117,8 @@ VERBOSE=0 STORAGE= LIST_REPOS=0 MODE="shell" +SETUP_NVIDIA=0 +ADDITIONAL_SINGULARITY_FLAGS= REPOSITORY="EESSI-pilot" RESUME= SAVE= @@ -141,6 +149,10 @@ while [[ $# -gt 0 ]]; do display_help exit 0 ;; + -i|--host-injections) + USER_HOST_INJECTIONS="$2" + shift 2 + ;; -l|--list-repos) LIST_REPOS=1 shift 1 @@ -149,6 +161,11 @@ while [[ $# -gt 0 ]]; do MODE="$2" shift 2 ;; + -n|--nvidia) + SETUP_NVIDIA=1 + NVIDIA_MODE="$2" + shift 2 + ;; -r|--repository) REPOSITORY="$2" shift 2 @@ -224,6 +241,13 @@ if [[ "${MODE}" != "shell" && "${MODE}" != "run" ]]; then fatal_error "unknown execution mode '${MODE}'" "${MODE_UNKNOWN_EXITCODE}" fi +# Also validate the NVIDIA GPU mode (if present) +if [[ ${SETUP_NVIDIA} -eq 1 ]]; then + if [[ "${NVIDIA_MODE}" != "run" && "${NVIDIA_MODE}" != "install" && "${NVIDIA_MODE}" != "all" ]]; then + fatal_error "unknown NVIDIA mode '${NVIDIA_MODE}'" "${NVIDIA_MODE_UNKNOWN_EXITCODE}" + fi +fi + # TODO (arg -r|--repository) check if repository is known # REPOSITORY_ERROR_EXITCODE if [[ ! -z "${REPOSITORY}" && "${REPOSITORY}" != "EESSI-pilot" && ! -r ${EESSI_REPOS_CFG_FILE} ]]; then @@ -294,6 +318,7 @@ else echo "Using ${EESSI_HOST_STORAGE} as tmp directory (to resume session add '--resume ${EESSI_HOST_STORAGE}')." fi + # if ${RESUME} is a file (assume a tgz), unpack it into ${EESSI_HOST_STORAGE} if [[ ! -z ${RESUME} && -f ${RESUME} ]]; then tar xf ${RESUME} -C ${EESSI_HOST_STORAGE} @@ -310,12 +335,25 @@ fi # |-overlay-work # |-home # |-repos_cfg +# |-opt-eessi (unless otherwise specificed for host_injections) # tmp dir for EESSI EESSI_TMPDIR=${EESSI_HOST_STORAGE} mkdir -p ${EESSI_TMPDIR} [[ ${VERBOSE} -eq 1 ]] && echo "EESSI_TMPDIR=${EESSI_TMPDIR}" +# Set host_injections directory and ensure it is a writable directory (if user provided) +if [ -z ${USER_HOST_INJECTIONS+x} ]; then + # Not set, so use our default + HOST_INJECTIONS=${EESSI_TMPDIR}/opt-eessi + mkdir -p $HOST_INJECTIONS +else + # Make sure the host_injections directory specified exists and is a folder + mkdir -p ${USER_HOST_INJECTIONS} || fatal_error "host_injections directory ${USER_HOST_INJECTIONS} is either not a directory or cannot be created" + HOST_INJECTIONS=${USER_HOST_INJECTIONS} +fi +[[ ${VERBOSE} -eq 1 ]] && echo "HOST_INJECTIONS=${HOST_INJECTIONS}" + # configure Singularity: if SINGULARITY_CACHEDIR is already defined, use that # a global SINGULARITY_CACHEDIR would ensure that we don't consume # storage space again and again for the container & also speed-up @@ -394,12 +432,34 @@ fi [[ ${VERBOSE} -eq 1 ]] && echo "SINGULARITY_HOME=${SINGULARITY_HOME}" # define paths to add to SINGULARITY_BIND (added later when all BIND mounts are defined) -BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs" +BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs,${HOST_INJECTIONS}:/opt/eessi" # provide a '/tmp' inside the container BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}" [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" +# Configure anything we need for NVIDIA GPUs and CUDA installation +if [[ ${SETUP_NVIDIA} -eq 1 ]]; then + if [[ "${NVIDIA_MODE}" == "run" || "${NVIDIA_MODE}" == "all" ]]; then + # Give singularity the appropriate flag + ADDITIONAL_SINGULARITY_FLAGS="--nv ${ADDITIONAL_SINGULARITY_FLAGS}" + [[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_SINGULARITY_FLAGS=${ADDITIONAL_SINGULARITY_FLAGS}" + fi + if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then + # Add additional bind mounts to allow CUDA to install within a container + # (Experience tells us that these are necessary, but we don't know _why_ + # as the CUDA installer is a black box. The suspicion is that the CUDA + # installer gets confused by the permissions on these directories when + # inside a container) + EESSI_VAR_LOG=${EESSI_TMPDIR}/var-log + EESSI_USR_LOCAL_CUDA=${EESSI_TMPDIR}/usr-local-cuda + mkdir -p ${EESSI_VAR_LOG} + mkdir -p ${EESSI_USR_LOCAL_CUDA} + BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda" + [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" + fi +fi + # set up repository config (always create directory repos_cfg and populate it with info when # arg -r|--repository is used) mkdir -p ${EESSI_TMPDIR}/repos_cfg @@ -558,8 +618,8 @@ if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then fi echo "Launching container with command (next line):" -echo "singularity ${RUN_QUIET} ${MODE} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@" -singularity ${RUN_QUIET} ${MODE} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@" +echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@" +singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@" exit_code=$? # 6. save tmp if requested (arg -s|--save) diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh new file mode 100755 index 0000000000..f02f0da02e --- /dev/null +++ b/gpu_support/nvidia/install_cuda_host_injections.sh @@ -0,0 +1,209 @@ +#!/usr/bin/env bash + +# This script can be used to install CUDA under the `.../host_injections` directory. +# This provides the parts of the CUDA installation that cannot be redistributed as +# part of EESSI due to license limitations. While GPU-based software from EESSI will +# _run_ without these, installation of additional CUDA software requires the CUDA +# installation(s) under `host_injections` to be present. +# +# The `host_injections` directory is a variant symlink that by default points to +# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see +# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the +# installation to be successful, this directory needs to be writeable by the user +# executing this script. + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../scripts/utils.sh + +# Function to display help message +show_help() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --help Display this help message" + echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" + echo " CUDA, see the EULA at" + echo " https://docs.nvidia.com/cuda/eula/index.html" + echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must" + echo " have a corresponding easyconfig in the" + echo " EasyBuild release)" + echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" + echo " storage during the CUDA install" + echo " (must have >10GB available)" +} + +# Initialize variables +install_cuda_version="" +eula_accepted=0 + +# Parse command-line options +while [[ $# -gt 0 ]]; do + case "$1" in + --help) + show_help + exit 0 + ;; + -c|--cuda-version) + if [ -n "$2" ]; then + install_cuda_version="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + --accept-cuda-eula) + eula_accepted=1 + shift 1 + ;; + -t|--temp-dir) + if [ -n "$2" ]; then + CUDA_TEMP_DIR="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + *) + show_help + fatal_error "Error: Unknown option: $1" + ;; + esac +done + +# Make sure EESSI is initialised +check_eessi_initialised + +# Make sure the CUDA version supplied is a semantic version +is_semantic_version() { + local version=$1 + local regex='^[0-9]+\.[0-9]+\.[0-9]+$' + + if [[ $version =~ $regex ]]; then + return 0 # Return success (0) if it's a semantic version + else + return 1 # Return failure (1) if it's not a semantic version + fi +} +if ! is_semantic_version "$install_cuda_version"; then + show_help + error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" + error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" + error="${error}version to provide is probably one of those available under\n" + error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" + fatal_error "${error}" +fi + +# Make sure they have accepted the CUDA EULA +if [ "$eula_accepted" -ne 1 ]; then + show_help + error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" + fatal_error "${error}" +fi + +# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` +# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) +cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} + +# Only install CUDA if specified version is not found. +# (existence of easybuild subdir implies a successful install) +if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then + echo_green "CUDA software found! No need to install CUDA again." +else + # We need to be able write to the installation space so let's make sure we can + if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then + fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if ! mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." + fi + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check." + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}to reduce this requirement. Exiting now..." + fatal_error "${error}" + fi + + if ! command -v "eb" &>/dev/null; then + echo_yellow "Attempting to load an EasyBuild module to do actual install" + module load EasyBuild + # There are some scenarios where this may fail + if [ $? -ne 0 ]; then + error="'eb' command not found in your environment and\n" + error="${error} module load EasyBuild\n" + error="${error}failed for some reason.\n" + error="${error}Please re-run this script with the 'eb' command available." + fatal_error "${error}" + fi + fi + + cuda_easyconfig="CUDA-${install_cuda_version}.eb" + + # Check the easyconfig file is available in the release + # (eb search always returns 0, so we need a grep to ensure a usable exit code) + eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1 + # Check the exit code + if [ $? -ne 0 ]; then + eb_version=$(eb --version) + available_cuda_easyconfigs=$(eb --search ^CUDA-*.eb|grep CUDA) + + error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n" + error="${error} ${eb_version}\n" + error="${error}You either need to give a different version of CUDA to install _or_ \n" + error="${error}use a different version of EasyBuild for the installation.\n" + error="${error}\nThe versions of available with the current eb command are:\n" + error="${error}${available_cuda_easyconfigs}" + fatal_error "${error}" + fi + + # We need the --rebuild option, as the CUDA module may or may not be on the + # `MODULEPATH` yet. Even if it is, we still want to redo this installation + # since it will provide the symlinked targets for the parts of the CUDA + # installation in the `.../versions/...` prefix + # We install the module in our `tmpdir` since we do not need the modulefile, + # we only care about providing the targets for the symlinks. + extra_args="--rebuild --installpath-modules=${tmpdir}" + + # We don't want hooks used in this install, we need a vanilla CUDA installation + touch "$tmpdir"/none.py + # shellcheck disable=SC2086 # Intended splitting of extra_args + eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" + ret=$? + if [ $ret -ne 0 ]; then + fatal_error "CUDA installation failed, please check EasyBuild logs..." + else + echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" + fi + # clean up tmpdir + rm -rf "${tmpdir}" +fi diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh new file mode 100755 index 0000000000..26760f0b82 --- /dev/null +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# This script links host libraries related to GPU drivers to a location where +# they can be found by the EESSI linker + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../scripts/utils.sh + +# We rely on ldconfig to give us the location of the libraries on the host +command_name="ldconfig" +# We cannot use a version of ldconfig that's being shipped under CVMFS +exclude_prefix="/cvmfs" + +found_paths=() +# Always attempt to use /sbin/ldconfig +if [ -x "/sbin/$command_name" ]; then + found_paths+=("/sbin/$command_name") +fi +IFS=':' read -ra path_dirs <<< "$PATH" +for dir in "${path_dirs[@]}"; do + if [ "$dir" = "/sbin" ]; then + continue # we've already checked for $command_name in /sbin, don't need to do it twice + fi + if [[ ! "$dir" =~ ^$exclude_prefix ]]; then + if [ -x "$dir/$command_name" ]; then + found_paths+=("$dir/$command_name") + fi + fi +done + +if [ ${#found_paths[@]} -gt 0 ]; then + echo "Found $command_name in the following locations:" + printf -- "- %s\n" "${found_paths[@]}" + echo "Using first version" + host_ldconfig=${found_paths[0]} +else + error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." + fatal_error $error +fi + +# Make sure EESSI is initialised (doesn't matter what version) +check_eessi_initialised + +# Find the CUDA version of the host CUDA drivers +# (making sure that this can still work inside prefix environment inside a container) +export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH +nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" +if $nvidia_smi_command > /dev/null; then + host_driver_version=$($nvidia_smi_command | tail -n1) + # If the first worked, this should work too + host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') +else + error="Failed to successfully execute\n $nvidia_smi_command\n" + fatal_error $error +fi + +# Let's make sure the driver libraries are not already in place +link_drivers=1 + +host_injections_nvidia_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/${EESSI_CPU_FAMILY}" +host_injection_driver_dir="${host_injections_nvidia_dir}/host" +host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" +if [ -e "$host_injection_driver_version_file" ]; then + if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then + echo_green "The host CUDA driver libraries have already been linked!" + link_drivers=0 + else + # There's something there but it is out of date + echo_yellow "Cleaning out outdated symlinks" + rm $host_injection_driver_dir/* + if [ $? -ne 0 ]; then + error="Unable to remove files under '$host_injection_driver_dir'." + fatal_error $error + fi + fi +fi + +drivers_linked=0 +if [ "$link_drivers" -eq 1 ]; then + if ! create_directory_structure "${host_injection_driver_dir}" ; then + fatal_error "No write permissions to directory ${host_injection_driver_dir}" + fi + cd ${host_injection_driver_dir} + # Need a small temporary space to hold a couple of files + temp_dir=$(mktemp -d) + + # Gather libraries on the host (_must_ be host ldconfig) + $host_ldconfig -p | awk '{print $NF}' > "$temp_dir"/libs.txt + # Allow for the fact that we may be in a container so the CUDA libs might be in there + ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null + + # Leverage singularity to find the full list of libraries we should be linking to + echo_yellow "Downloading latest version of nvliblist.conf from Apptainer" + curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf + + # Make symlinks to all the interesting libraries + grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} + + # Inject driver and CUDA versions into dir + echo $host_driver_version > driver_version.txt + echo $host_cuda_version > cuda_version.txt + drivers_linked=1 + + # Remove the temporary directory when done + rm -r "$temp_dir" +fi + +# Make latest symlink for NVIDIA drivers +cd $host_injections_nvidia_dir +symlink="latest" +if [ -L "$symlink" ]; then + # Unless the drivers have been installed, leave the symlink alone + if [ "$drivers_linked" -eq 1 ]; then + ln -sf host latest + fi +else + # No link exists yet + ln -s host latest +fi + +# Make sure the libraries can be found by the EESSI linker +host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} +if [ -L "$host_injection_linker_dir/lib" ]; then + target_path=$(readlink -f "$host_injection_linker_dir/lib") + if [ "$target_path" != "$$host_injections_nvidia_dir/latest" ]; then + cd $host_injection_linker_dir + ln -sf $host_injections_nvidia_dir/latest lib + fi +else + create_directory_structure $host_injection_linker_dir + cd $host_injection_linker_dir + ln -s $host_injections_nvidia_dir/latest lib +fi + +echo_green "Host NVIDIA gpu drivers linked successfully for EESSI" diff --git a/init/arch_specs/eessi_arch_arm.spec b/init/arch_specs/eessi_arch_arm.spec index 92f32a76d8..b5c9275043 100755 --- a/init/arch_specs/eessi_arch_arm.spec +++ b/init/arch_specs/eessi_arch_arm.spec @@ -1,6 +1,6 @@ # ARM CPU architecture specifications # Software path in EESSI | Vendor ID | List of defining CPU features -"aarch64/arm/neoverse-n1" "ARM" "asimd" # Ampere Altra -"aarch64/arm/neoverse-n1" "" "asimd" # AWS Graviton2 -"aarch64/arm/neoverse-v1" "ARM" "asimd svei8mm" -"aarch64/arm/neoverse-v1" "" "asimd svei8mm" # AWS Graviton3 +"aarch64/neoverse_n1" "ARM" "asimd" # Ampere Altra +"aarch64/neoverse_n1" "" "asimd" # AWS Graviton2 +"aarch64/neoverse_v1" "ARM" "asimd svei8mm" +"aarch64/neoverse_v1" "" "asimd svei8mm" # AWS Graviton3 diff --git a/init/eessi_archdetect.sh b/init/eessi_archdetect.sh index d2b6dacf04..58dd99eb6b 100755 --- a/init/eessi_archdetect.sh +++ b/init/eessi_archdetect.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash -VERSION="1.0.0" +VERSION="1.1.0" # Logging LOG_LEVEL="INFO" +# Default result type is a best match +CPUPATH_RESULT="best" timestamp () { date "+%Y-%m-%d %H:%M:%S" @@ -105,7 +107,8 @@ cpupath(){ log "DEBUG" "cpupath: CPU flags of host system: '$cpu_flags'" # Default to generic CPU - local best_arch_match="generic" + local best_arch_match="$machine_type/generic" + local all_arch_matches=$best_arch_match # Iterate over the supported CPU specifications to find the best match for host CPU # Order of the specifications matters, the last one to match will be selected @@ -114,22 +117,29 @@ cpupath(){ if [ "${cpu_vendor}x" == "${arch_spec[1]}x" ]; then # each flag in this CPU specification must be found in the list of flags of the host check_allinfirst "${cpu_flags[*]}" ${arch_spec[2]} && best_arch_match=${arch_spec[0]} && \ - log "DEBUG" "cpupath: host CPU best match updated to $best_arch_match" + all_arch_matches="$best_arch_match:$all_arch_matches" && \ + log "DEBUG" "cpupath: host CPU best match updated to $best_arch_match" fi done - log "INFO" "cpupath: best match for host CPU: $best_arch_match" - echo "$best_arch_match" + if [ "allx" == "${CPUPATH_RESULT}x" ]; then + log "INFO" "cpupath: all matches for host CPU: $all_arch_matches" + echo "$all_arch_matches" + else + log "INFO" "cpupath: best match for host CPU: $best_arch_match" + echo "$best_arch_match" + fi } # Parse command line arguments -USAGE="Usage: eessi_archdetect.sh [-h][-d] " +USAGE="Usage: eessi_archdetect.sh [-h][-d][-a] " -while getopts 'hdv' OPTION; do +while getopts 'hdva' OPTION; do case "$OPTION" in h) echo "$USAGE"; exit 0;; d) LOG_LEVEL="DEBUG";; v) echo "eessi_archdetect.sh v$VERSION"; exit 0;; + a) CPUPATH_RESULT="all";; ?) echo "$USAGE"; exit 1;; esac done @@ -139,5 +149,5 @@ ARGUMENT=${1:-none} case "$ARGUMENT" in "cpupath") cpupath; exit;; - *) echo "$USAGE"; log "ERROR" "Missing argument";; + *) echo "$USAGE"; log "ERROR" "Missing argument (possible actions: 'cpupath')";; esac diff --git a/scripts/utils.sh b/scripts/utils.sh index d0da95e87f..b2be3f6221 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -14,7 +14,7 @@ ANY_ERROR_EXITCODE=1 function fatal_error() { echo_red "ERROR: $1" >&2 if [[ $# -gt 1 ]]; then - exit $2 + exit "$2" else exit "${ANY_ERROR_EXITCODE}" fi @@ -32,11 +32,57 @@ function check_exit_code { fi } +function check_eessi_initialised() { + if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "EESSI has not been initialised!" + else + return 0 + fi +} + +function check_in_prefix_shell() { + # Make sure EPREFIX is defined + if [[ -z "${EPREFIX}" ]]; then + fatal_error "This script cannot be used without having first defined EPREFIX" + fi + if [[ ! ${SHELL} = ${EPREFIX}/bin/bash ]]; then + fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" + fi +} + +function create_directory_structure() { + # Ensure we are given a single path argument + if [ $# -ne 1 ]; then + echo_red "Function requires a single (relative or absolute) path argument" >&2 + return $ANY_ERROR_EXITCODE + fi + dir_structure="$1" + + # Attempt to create the directory structure + error_message=$(mkdir -p "$dir_structure" 2>&1) + return_code=$? + # If it fails be explicit about the error + if [ ${return_code} -ne 0 ]; then + real_dir=$(realpath -m "$dir_structure") + echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 + else + # If we're creating it, our use case is that we want to be able to write there + # (this is a check in case the directory already existed) + if [ ! -w "${dir_structure}" ]; then + real_dir=$(realpath -m "$dir_structure") + echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" + return_code=$ANY_ERROR_EXITCODE + fi + fi + + return $return_code +} + function get_path_for_tool { tool_name=$1 tool_envvar_name=$2 - which_out=$(which ${tool_name} 2>&1) + which_out=$(which "${tool_name}" 2>&1) exit_code=$? if [[ ${exit_code} -eq 0 ]]; then echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2 @@ -68,7 +114,7 @@ function get_host_from_url { url=$1 re="(http|https)://([^/:]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -80,7 +126,7 @@ function get_port_from_url { url=$1 re="(http|https)://[^:]+:([0-9]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -90,7 +136,7 @@ function get_port_from_url { function get_ipv4_address { hname=$1 - hipv4=$(grep ${hname} /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) + hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) # TODO try other methods if the one above does not work --> tool that verifies # what method can be used? echo "${hipv4}" diff --git a/tests/archdetect/aarch64/arm/neoverse-n1/AWS-awslinux-graviton2.output b/tests/archdetect/aarch64/arm/neoverse-n1/AWS-awslinux-graviton2.output deleted file mode 100644 index b4dc5e9f1b..0000000000 --- a/tests/archdetect/aarch64/arm/neoverse-n1/AWS-awslinux-graviton2.output +++ /dev/null @@ -1 +0,0 @@ -aarch64/arm/neoverse-n1 diff --git a/tests/archdetect/aarch64/arm/neoverse-n1/Azure-Ubuntu20-Altra.output b/tests/archdetect/aarch64/arm/neoverse-n1/Azure-Ubuntu20-Altra.output deleted file mode 100644 index b4dc5e9f1b..0000000000 --- a/tests/archdetect/aarch64/arm/neoverse-n1/Azure-Ubuntu20-Altra.output +++ /dev/null @@ -1 +0,0 @@ -aarch64/arm/neoverse-n1 diff --git a/tests/archdetect/aarch64/arm/neoverse-v1/AWS-awslinux-graviton3.output b/tests/archdetect/aarch64/arm/neoverse-v1/AWS-awslinux-graviton3.output deleted file mode 100644 index 20db96d01f..0000000000 --- a/tests/archdetect/aarch64/arm/neoverse-v1/AWS-awslinux-graviton3.output +++ /dev/null @@ -1 +0,0 @@ -aarch64/arm/neoverse-v1 diff --git a/tests/archdetect/aarch64/neoverse_n1/AWS-awslinux-graviton2.all.output b/tests/archdetect/aarch64/neoverse_n1/AWS-awslinux-graviton2.all.output new file mode 100644 index 0000000000..340aaa5d02 --- /dev/null +++ b/tests/archdetect/aarch64/neoverse_n1/AWS-awslinux-graviton2.all.output @@ -0,0 +1 @@ +aarch64/neoverse_n1:aarch64/generic diff --git a/tests/archdetect/aarch64/arm/neoverse-n1/AWS-awslinux-graviton2.cpuinfo b/tests/archdetect/aarch64/neoverse_n1/AWS-awslinux-graviton2.cpuinfo similarity index 100% rename from tests/archdetect/aarch64/arm/neoverse-n1/AWS-awslinux-graviton2.cpuinfo rename to tests/archdetect/aarch64/neoverse_n1/AWS-awslinux-graviton2.cpuinfo diff --git a/tests/archdetect/aarch64/neoverse_n1/AWS-awslinux-graviton2.output b/tests/archdetect/aarch64/neoverse_n1/AWS-awslinux-graviton2.output new file mode 100644 index 0000000000..a9bd49c75c --- /dev/null +++ b/tests/archdetect/aarch64/neoverse_n1/AWS-awslinux-graviton2.output @@ -0,0 +1 @@ +aarch64/neoverse_n1 diff --git a/tests/archdetect/aarch64/neoverse_n1/Azure-Ubuntu20-Altra.all.output b/tests/archdetect/aarch64/neoverse_n1/Azure-Ubuntu20-Altra.all.output new file mode 100644 index 0000000000..340aaa5d02 --- /dev/null +++ b/tests/archdetect/aarch64/neoverse_n1/Azure-Ubuntu20-Altra.all.output @@ -0,0 +1 @@ +aarch64/neoverse_n1:aarch64/generic diff --git a/tests/archdetect/aarch64/arm/neoverse-n1/Azure-Ubuntu20-Altra.cpuinfo b/tests/archdetect/aarch64/neoverse_n1/Azure-Ubuntu20-Altra.cpuinfo similarity index 100% rename from tests/archdetect/aarch64/arm/neoverse-n1/Azure-Ubuntu20-Altra.cpuinfo rename to tests/archdetect/aarch64/neoverse_n1/Azure-Ubuntu20-Altra.cpuinfo diff --git a/tests/archdetect/aarch64/neoverse_n1/Azure-Ubuntu20-Altra.output b/tests/archdetect/aarch64/neoverse_n1/Azure-Ubuntu20-Altra.output new file mode 100644 index 0000000000..a9bd49c75c --- /dev/null +++ b/tests/archdetect/aarch64/neoverse_n1/Azure-Ubuntu20-Altra.output @@ -0,0 +1 @@ +aarch64/neoverse_n1 diff --git a/tests/archdetect/aarch64/neoverse_v1/AWS-awslinux-graviton3.all.output b/tests/archdetect/aarch64/neoverse_v1/AWS-awslinux-graviton3.all.output new file mode 100644 index 0000000000..920d5f9996 --- /dev/null +++ b/tests/archdetect/aarch64/neoverse_v1/AWS-awslinux-graviton3.all.output @@ -0,0 +1 @@ +aarch64/neoverse_v1:aarch64/neoverse_n1:aarch64/generic diff --git a/tests/archdetect/aarch64/arm/neoverse-v1/AWS-awslinux-graviton3.cpuinfo b/tests/archdetect/aarch64/neoverse_v1/AWS-awslinux-graviton3.cpuinfo similarity index 100% rename from tests/archdetect/aarch64/arm/neoverse-v1/AWS-awslinux-graviton3.cpuinfo rename to tests/archdetect/aarch64/neoverse_v1/AWS-awslinux-graviton3.cpuinfo diff --git a/tests/archdetect/aarch64/neoverse_v1/AWS-awslinux-graviton3.output b/tests/archdetect/aarch64/neoverse_v1/AWS-awslinux-graviton3.output new file mode 100644 index 0000000000..a8e072a9c6 --- /dev/null +++ b/tests/archdetect/aarch64/neoverse_v1/AWS-awslinux-graviton3.output @@ -0,0 +1 @@ +aarch64/neoverse_v1 diff --git a/tests/archdetect/ppc64le/power9le/unknown-power9le.all.output b/tests/archdetect/ppc64le/power9le/unknown-power9le.all.output new file mode 100644 index 0000000000..7ecf79d0a7 --- /dev/null +++ b/tests/archdetect/ppc64le/power9le/unknown-power9le.all.output @@ -0,0 +1 @@ +ppc64le/power9le:ppc64le/generic \ No newline at end of file diff --git a/tests/archdetect/x86_64/amd/zen2/Azure-CentOS7-7V12.all.output b/tests/archdetect/x86_64/amd/zen2/Azure-CentOS7-7V12.all.output new file mode 100644 index 0000000000..180de26f0e --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen2/Azure-CentOS7-7V12.all.output @@ -0,0 +1 @@ +x86_64/amd/zen2:x86_64/generic \ No newline at end of file diff --git a/tests/archdetect/x86_64/amd/zen3/Azure-CentOS7-7V73X.all.output b/tests/archdetect/x86_64/amd/zen3/Azure-CentOS7-7V73X.all.output new file mode 100644 index 0000000000..798a0aa565 --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen3/Azure-CentOS7-7V73X.all.output @@ -0,0 +1 @@ +x86_64/amd/zen3:x86_64/amd/zen2:x86_64/generic \ No newline at end of file diff --git a/tests/archdetect/x86_64/intel/haswell/archspec-linux-E5-2680-v3.all.output b/tests/archdetect/x86_64/intel/haswell/archspec-linux-E5-2680-v3.all.output new file mode 100644 index 0000000000..a047dd42cc --- /dev/null +++ b/tests/archdetect/x86_64/intel/haswell/archspec-linux-E5-2680-v3.all.output @@ -0,0 +1 @@ +x86_64/intel/haswell:x86_64/generic \ No newline at end of file diff --git a/tests/archdetect/x86_64/intel/skylake_avx512/archspec-linux-6132.all.output b/tests/archdetect/x86_64/intel/skylake_avx512/archspec-linux-6132.all.output new file mode 100644 index 0000000000..c9fa524ea6 --- /dev/null +++ b/tests/archdetect/x86_64/intel/skylake_avx512/archspec-linux-6132.all.output @@ -0,0 +1 @@ +x86_64/intel/skylake_avx512:x86_64/intel/haswell:x86_64/generic \ No newline at end of file