Skip to content

Commit

Permalink
XMerge remote-tracking branch 'upstream/2023.06-pilot.eessi-hpc.org' …
Browse files Browse the repository at this point in the history
…into cuda_install
  • Loading branch information
ocaisa committed Dec 1, 2023
2 parents 2a919a4 + 4e59a22 commit b6d8c8a
Show file tree
Hide file tree
Showing 30 changed files with 557 additions and 53 deletions.
21 changes: 17 additions & 4 deletions .github/workflows/tests_archdetect.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@ jobs:
- x86_64/amd/zen2/Azure-CentOS7-7V12
- x86_64/amd/zen3/Azure-CentOS7-7V73X
- ppc64le/power9le/unknown-power9le
- aarch64/arm/neoverse-n1/Azure-Ubuntu20-Altra
- aarch64/arm/neoverse-n1/AWS-awslinux-graviton2
- aarch64/arm/neoverse-v1/AWS-awslinux-graviton3
- aarch64/neoverse_n1/Azure-Ubuntu20-Altra
- aarch64/neoverse_n1/AWS-awslinux-graviton2
- aarch64/neoverse_v1/AWS-awslinux-graviton3
fail-fast: false
steps:
- name: checkout
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0

- name: Enable EESSI
uses: eessi/github-action-eessi@58b50fd2eead2162c2b9ac258d4fb60cc9f30503 # v2.0.13
- name: test eessi_archdetect.sh
run: |
export EESSI_MACHINE_TYPE=${{matrix.proc_cpuinfo}}
Expand All @@ -34,3 +35,15 @@ jobs:
echo "Test for ${{matrix.proc_cpuinfo}} FAILED: $CPU_ARCH" >&2
exit 1
fi
CPU_ARCHES=$(./init/eessi_archdetect.sh -a cpupath)
if [[ $CPU_ARCHES == "$( cat ./tests/archdetect/${{matrix.proc_cpuinfo}}.all.output )" ]]; then
echo "Test for ${{matrix.proc_cpuinfo}} PASSED: $CPU_ARCHES" >&2
else
echo "Test for ${{matrix.proc_cpuinfo}} FAILED: $CPU_ARCHES" >&2
exit 1
fi
# Check all those architectures actually exist
for dir in $(echo "$CPU_ARCHES" | tr ':' '\n'); do
# Search all EESSI versions as we may drop support at some point
ls -d "$EESSI_PREFIX"/../*/software/linux/"$dir"
done
3 changes: 1 addition & 2 deletions configure_easybuild
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ fi

# note: filtering Bison may break some installations, like Qt5 (see https://github.com/EESSI/software-layer/issues/49)
# filtering pkg-config breaks R-bundle-Bioconductor installation (see also https://github.com/easybuilders/easybuild-easyconfigs/pull/11104)
# problems occur when filtering pkg-config with gnuplot too (picks up Lua 5.1 from $EPREFIX rather than from Lua 5.3 dependency)
DEPS_TO_FILTER=Autoconf,Automake,Autotools,binutils,bzip2,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,Lua,M4,makeinfo,ncurses,util-linux,XZ,zlib
DEPS_TO_FILTER=Autoconf,Automake,Autotools,binutils,bzip2,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,M4,makeinfo,ncurses,util-linux,XZ,zlib
# For aarch64 we need to also filter out Yasm.
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/11190
if [[ "$EESSI_CPU_FAMILY" == "aarch64" ]]; then
Expand Down
20 changes: 20 additions & 0 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


CPU_TARGET_NEOVERSE_V1 = 'aarch64/neoverse_v1'
CPU_TARGET_AARCH64_GENERIC = 'aarch64/generic'

EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs'

Expand Down Expand Up @@ -276,6 +277,24 @@ def pre_configure_hook_wrf_aarch64(self, *args, **kwargs):
raise EasyBuildError("WRF-specific hook triggered for non-WRF easyconfig?!")


def pre_configure_hook_LAMMPS_aarch64(self, *args, **kwargs):
"""
pre-configure hook for LAMMPS:
- set kokkos_arch on Aarch64
"""

cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR')
if self.name == 'LAMMPS':
if self.version == '23Jun2022':
if get_cpu_architecture() == AARCH64:
if cpu_target == CPU_TARGET_AARCH64_GENERIC:
self.cfg['kokkos_arch'] = 'ARM80'
else:
self.cfg['kokkos_arch'] = 'ARM81'
else:
raise EasyBuildError("LAMMPS-specific hook triggered for non-LAMMPS easyconfig?!")


def pre_test_hook(self,*args, **kwargs):
"""Main pre-test hook: trigger custom functions based on software name."""
if self.name in PRE_TEST_HOOKS:
Expand Down Expand Up @@ -428,6 +447,7 @@ def inject_gpu_property(ec):
'MetaBAT': pre_configure_hook_metabat_filtered_zlib_dep,
'OpenBLAS': pre_configure_hook_openblas_optarch_generic,
'WRF': pre_configure_hook_wrf_aarch64,
'LAMMPS': pre_configure_hook_LAMMPS_aarch64,
}

PRE_TEST_HOOKS = {
Expand Down
5 changes: 5 additions & 0 deletions eessi-2023.06-eb-4.8.0-2021b.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,9 @@ easyconfigs:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/18746
options:
from-pr: 18746
- gnuplot-5.4.2-GCCcore-11.2.0.eb:
# make sure that Lua dependency is correctly picked up,
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/19261
options:
from-pr: 19261
- OpenFOAM-v2112-foss-2021b.eb
8 changes: 8 additions & 0 deletions eessi-2023.06-eb-4.8.1-2021b.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,11 @@ easyconfigs:
options:
from-pr: 18834
- R-4.2.0-foss-2021b.eb
- PLUMED-2.7.3-foss-2021b.eb:
# the --enable-asmjit is not supported on Aarch64
options:
from-pr: 19110
- LAMMPS-23Jun2022-foss-2021b-kokkos.eb:
# TBB and ScaFaCos are optional dependencies when building on Intel arch
options:
from-pr: 19246
6 changes: 6 additions & 0 deletions eessi-2023.06-eb-4.8.2-2021b.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
easyconfigs:
- Pillow-8.3.2-GCCcore-11.2.0.eb:
# avoid that hardcoded paths like /usr/include are used in build commands
options:
from-pr: 19226
- matplotlib-3.4.3-foss-2021b.eb
26 changes: 5 additions & 21 deletions eessi-2023.06-eb-4.8.2-2022a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,6 @@ easyconfigs:
- AOFlagger-3.4.0-foss-2022a:
options:
from-pr: 19119
# Exclude Lua from `filter-deps` as the compat layer version is too old for the software
filter-deps:
- Autoconf
- Automake
- Autotools
- binutils
- bzip2
- DBus
- flex
- gettext
- gperf
- help2man
- intltool
- libreadline
- libtool
- ncurses
- M4
- makeinfo
- util-linux
- XZ
- zlib
- EveryBeam-0.5.2-foss-2022a:
options:
from-pr: 19119
Expand All @@ -35,3 +14,8 @@ easyconfigs:
- WSClean-3.4-foss-2022a:
options:
from-pr: 19119
- Pillow-9.1.1-GCCcore-11.3.0.eb:
# avoid that hardcoded paths like /usr/include are used in build commands
options:
from-pr: 19226
- matplotlib-3.5.2-foss-2022a.eb
72 changes: 66 additions & 6 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
# -. initial settings & exit codes
TOPDIR=$(dirname $(realpath $0))

source ${TOPDIR}/scripts/utils.sh
source ${TOPDIR}/scripts/cfg_files.sh
source "${TOPDIR}"/scripts/utils.sh
source "${TOPDIR}"/scripts/cfg_files.sh

# exit codes: bitwise shift codes to allow for combination of exit codes
# ANY_ERROR_EXITCODE is sourced from ${TOPDIR}/scripts/utils.sh
Expand All @@ -46,6 +46,7 @@ SAVE_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 8))
HTTP_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 9))
HTTPS_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 10))
RUN_SCRIPT_MISSING_EXITCODE=$((${ANY_ERROR_EXITCODE} << 11))
NVIDIA_MODE_UNKNOWN_EXITCODE=$((${ANY_ERROR_EXITCODE} << 12))

# CernVM-FS settings
CVMFS_VAR_LIB="var-lib-cvmfs"
Expand All @@ -72,12 +73,17 @@ display_help() {
echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
echo " -c | --container IMG - image file or URL defining the container to use"
echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
echo " -h | --help - display this usage information [default: false]"
echo " -g | --storage DIR - directory space on host machine (used for"
echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
echo " -h | --help - display this usage information [default: false]"
echo " -i | --host-injections - directory to link to for host_injections "
echo " [default: /..storage../opt-eessi]"
echo " -l | --list-repos - list available repository identifiers [default: false]"
echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or"
echo " MODE==run (run a script or command) [default: shell]"
echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs,"
echo " MODE==install for a CUDA installation, MODE==run to"
echo " attach a GPU, MODE==all for both [default: false]"
echo " -r | --repository CFG - configuration file or identifier defining the"
echo " repository to use [default: EESSI-pilot via"
echo " default container, see --container]"
Expand Down Expand Up @@ -111,6 +117,8 @@ VERBOSE=0
STORAGE=
LIST_REPOS=0
MODE="shell"
SETUP_NVIDIA=0
ADDITIONAL_SINGULARITY_FLAGS=
REPOSITORY="EESSI-pilot"
RESUME=
SAVE=
Expand Down Expand Up @@ -141,6 +149,10 @@ while [[ $# -gt 0 ]]; do
display_help
exit 0
;;
-i|--host-injections)
USER_HOST_INJECTIONS="$2"
shift 2
;;
-l|--list-repos)
LIST_REPOS=1
shift 1
Expand All @@ -149,6 +161,11 @@ while [[ $# -gt 0 ]]; do
MODE="$2"
shift 2
;;
-n|--nvidia)
SETUP_NVIDIA=1
NVIDIA_MODE="$2"
shift 2
;;
-r|--repository)
REPOSITORY="$2"
shift 2
Expand Down Expand Up @@ -224,6 +241,13 @@ if [[ "${MODE}" != "shell" && "${MODE}" != "run" ]]; then
fatal_error "unknown execution mode '${MODE}'" "${MODE_UNKNOWN_EXITCODE}"
fi

# Also validate the NVIDIA GPU mode (if present)
if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
if [[ "${NVIDIA_MODE}" != "run" && "${NVIDIA_MODE}" != "install" && "${NVIDIA_MODE}" != "all" ]]; then
fatal_error "unknown NVIDIA mode '${NVIDIA_MODE}'" "${NVIDIA_MODE_UNKNOWN_EXITCODE}"
fi
fi

# TODO (arg -r|--repository) check if repository is known
# REPOSITORY_ERROR_EXITCODE
if [[ ! -z "${REPOSITORY}" && "${REPOSITORY}" != "EESSI-pilot" && ! -r ${EESSI_REPOS_CFG_FILE} ]]; then
Expand Down Expand Up @@ -294,6 +318,7 @@ else
echo "Using ${EESSI_HOST_STORAGE} as tmp directory (to resume session add '--resume ${EESSI_HOST_STORAGE}')."
fi


# if ${RESUME} is a file (assume a tgz), unpack it into ${EESSI_HOST_STORAGE}
if [[ ! -z ${RESUME} && -f ${RESUME} ]]; then
tar xf ${RESUME} -C ${EESSI_HOST_STORAGE}
Expand All @@ -310,12 +335,25 @@ fi
# |-overlay-work
# |-home
# |-repos_cfg
# |-opt-eessi (unless otherwise specificed for host_injections)

# tmp dir for EESSI
EESSI_TMPDIR=${EESSI_HOST_STORAGE}
mkdir -p ${EESSI_TMPDIR}
[[ ${VERBOSE} -eq 1 ]] && echo "EESSI_TMPDIR=${EESSI_TMPDIR}"

# Set host_injections directory and ensure it is a writable directory (if user provided)
if [ -z ${USER_HOST_INJECTIONS+x} ]; then
# Not set, so use our default
HOST_INJECTIONS=${EESSI_TMPDIR}/opt-eessi
mkdir -p $HOST_INJECTIONS
else
# Make sure the host_injections directory specified exists and is a folder
mkdir -p ${USER_HOST_INJECTIONS} || fatal_error "host_injections directory ${USER_HOST_INJECTIONS} is either not a directory or cannot be created"
HOST_INJECTIONS=${USER_HOST_INJECTIONS}
fi
[[ ${VERBOSE} -eq 1 ]] && echo "HOST_INJECTIONS=${HOST_INJECTIONS}"

# configure Singularity: if SINGULARITY_CACHEDIR is already defined, use that
# a global SINGULARITY_CACHEDIR would ensure that we don't consume
# storage space again and again for the container & also speed-up
Expand Down Expand Up @@ -394,12 +432,34 @@ fi
[[ ${VERBOSE} -eq 1 ]] && echo "SINGULARITY_HOME=${SINGULARITY_HOME}"

# define paths to add to SINGULARITY_BIND (added later when all BIND mounts are defined)
BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs"
BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs,${HOST_INJECTIONS}:/opt/eessi"
# provide a '/tmp' inside the container
BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}"

[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"

# Configure anything we need for NVIDIA GPUs and CUDA installation
if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
if [[ "${NVIDIA_MODE}" == "run" || "${NVIDIA_MODE}" == "all" ]]; then
# Give singularity the appropriate flag
ADDITIONAL_SINGULARITY_FLAGS="--nv ${ADDITIONAL_SINGULARITY_FLAGS}"
[[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_SINGULARITY_FLAGS=${ADDITIONAL_SINGULARITY_FLAGS}"
fi
if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then
# Add additional bind mounts to allow CUDA to install within a container
# (Experience tells us that these are necessary, but we don't know _why_
# as the CUDA installer is a black box. The suspicion is that the CUDA
# installer gets confused by the permissions on these directories when
# inside a container)
EESSI_VAR_LOG=${EESSI_TMPDIR}/var-log
EESSI_USR_LOCAL_CUDA=${EESSI_TMPDIR}/usr-local-cuda
mkdir -p ${EESSI_VAR_LOG}
mkdir -p ${EESSI_USR_LOCAL_CUDA}
BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
fi
fi

# set up repository config (always create directory repos_cfg and populate it with info when
# arg -r|--repository is used)
mkdir -p ${EESSI_TMPDIR}/repos_cfg
Expand Down Expand Up @@ -558,8 +618,8 @@ if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then
fi

echo "Launching container with command (next line):"
echo "singularity ${RUN_QUIET} ${MODE} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
singularity ${RUN_QUIET} ${MODE} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
exit_code=$?

# 6. save tmp if requested (arg -s|--save)
Expand Down
Loading

0 comments on commit b6d8c8a

Please sign in to comment.