General config and run_reframe.sh for local and EESSI stack #200

Merged: 19 commits merged on Dec 18, 2024
Changes from 8 commits
43 changes: 43 additions & 0 deletions CI/hortense_EESSI_ss/ci_config.sh
@@ -0,0 +1,43 @@
# Configurable items
if [ -z "${TEST_SUITE_PARTITION}" ]; then
echo "You have to indicate on which partition the test-suite will run on vsc-Hortense"
echo "This needs to be TEST_SUITE_PARTITION=cpu_rome_256gb"
echo "untill new functionality of `sched_options` is part of"
echo "# the ReFrame release https://github.com/reframe-hpc/reframe/issues/2970"
exit 1
fi

if [ -z "${REFRAME_ARGS}" ]; then
REFRAME_ARGS="--tag CI --tag 1_node|2_nodes --system hortense:${TEST_SUITE_PARTITION}"
fi

if [ -z "${UNSET_MODULEPATH}" ]; then
export UNSET_MODULEPATH=False
module --force purge
fi

if [ -z "${USE_EESSI_SOFTWARE_STACK}" ]; then
export USE_EESSI_SOFTWARE_STACK=True
fi

if [ -z "${RFM_CONFIG_FILES}" ]; then
export RFM_CONFIG_FILES="${TEMPDIR}/test-suite/config/vsc_hortense.py"
fi

if [ -z "${SET_LOCAL_MODULE_ENV}"]; then
export SET_LOCAL_MODULE_ENV=True
fi

if [ -z "${LOCAL_MODULES}"]; then
if [ "$TEST_SUITE_PARTITION" == "cpu_rome_256gb" ]; then
export LOCAL_MODULES="env/vsc/dodrio/cpu_rome env/slurm/dodrio/cpu_rome"
elif [ "$TEST_SUITE_PARTITION" == "cpu_rome_512gb" ]; then
export LOCAL_MODULES="env/vsc/dodrio/cpu_rome_512 env/slurm/dodrio/cpu_rome_512"
elif [ "$TEST_SUITE_PARTITION" == "gpu_rome_a100_40gb" ]; then
export LOCAL_MODULES="env/vsc/dodrio/gpu_rome_a100_40 env/slurm/dodrio/gpu_rome_a100_40"
elif [ "$TEST_SUITE_PARTITION" == "gpu_rome_a100_80gb" ]; then
export LOCAL_MODULES="env/vsc/dodrio/gpu_rome_a100_80 env/slurm/dodrio/gpu_rome_a100_80"
else
export LOCAL_MODULES="env/vsc/dodrio/${TEST_SUITE_PARTITION} env/slurm/dodrio/${TEST_SUITE_PARTITION}"
fi
fi
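
A minimal usage sketch for this config (assumptions: run_reframe.sh is invoked from the repository root and resolves CI_CONFIG to this file; the exact selection mechanism sits above the visible hunk of run_reframe.sh):

# sketch only: pick the (currently only supported) partition and start the CI driver
export TEST_SUITE_PARTITION=cpu_rome_256gb
./CI/run_reframe.sh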
42 changes: 42 additions & 0 deletions CI/hortense_local_ss/ci_config.sh
@@ -0,0 +1,42 @@
# Configurable items
if [ -z "${TEST_SUITE_PARTITION}" ]; then
echo "You have to indicate on which partition the test-suite will run on vsc-Hortense"
echo "This environment variable needs to be set TEST_SUITE_PARTITION=cpu_rome_256gb"
echo "Can only set to 'cpu_rome_256gb' untill new functionality of 'sched_options' is part of"
echo "the ReFrame release https://github.com/reframe-hpc/reframe/issues/2970"
exit 1
fi

if [ -z "${REFRAME_ARGS}" ]; then
REFRAME_ARGS="--tag CI --tag 1_node|2_nodes --system hortense:${TEST_SUITE_PARTITION}"
fi

if [ -z "${UNSET_MODULEPATH}" ]; then
export UNSET_MODULEPATH=False
fi

if [ -z "${USE_EESSI_SOFTWARE_STACK}" ]; then
export USE_EESSI_SOFTWARE_STACK=False
fi

if [ -z "${RFM_CONFIG_FILES}" ]; then
export RFM_CONFIG_FILES="${TEMPDIR}/test-suite/config/vsc_hortense.py"
fi

if [ -z "${SET_LOCAL_MODULE_ENV}"]; then
export SET_LOCAL_MODULE_ENV=True
fi

if [ -z "${LOCAL_MODULES}"]; then
if [ "$TEST_SUITE_PARTITION" == "cpu_rome_256gb" ]; then
export LOCAL_MODULES="cluster/dodrio/cpu_rome"
elif [ "$TEST_SUITE_PARTITION" == "cpu_rome_512gb" ]; then
export LOCAL_MODULES="cluster/dodrio/cpu_rome_512"
elif [ "$TEST_SUITE_PARTITION" == "gpu_rome_a100_40gb" ]; then
export LOCAL_MODULES="cluster/dodrio/gpu_rome_a100_40"
elif [ "$TEST_SUITE_PARTITION" == "gpu_rome_a100_80gb" ]; then
export LOCAL_MODULES="cluster/dodrio/gpu_rome_a100_80"
else
export LOCAL_MODULES="cluster/dodrio/${TEST_SUITE_PARTITION}"
fi
fi
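
The local-stack counterpart can be driven the same way; a sketch under the same assumptions, where USE_EESSI_SOFTWARE_STACK defaults to False so run_reframe.sh loads the cluster modules from LOCAL_MODULES instead of sourcing the EESSI init script:

# sketch only: same invocation, but backed by the local software stack
export TEST_SUITE_PARTITION=cpu_rome_256gb
./CI/run_reframe.sh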
47 changes: 36 additions & 11 deletions CI/run_reframe.sh
@@ -25,13 +25,15 @@ if [ ! -f "${CI_CONFIG}" ]; then
exit 1
fi

# Create temporary directory
if [ -z "${TEMPDIR}" ]; then
TEMPDIR=$(mktemp --directory --tmpdir=/tmp -t rfm.XXXXXXXXXX)
fi

# Set the CI configuration for this system
source "${CI_CONFIG}"

# Set default configuration, but let anything set by CI_CONFIG take priority
if [ -z "${TEMPDIR}" ]; then
TEMPDIR=$(mktemp --directory --tmpdir=/tmp -t rfm.XXXXXXXXXX)
fi
if [ -z "${REFRAME_ARGS}" ]; then
REFRAME_ARGS="--tag CI --tag 1_node"
fi
@@ -50,11 +52,14 @@ fi
if [ -z "${EESSI_TESTSUITE_BRANCH}" ]; then
EESSI_TESTSUITE_BRANCH='v0.4.0'
fi
if [ -z "${EESSI_CVMFS_REPO}" ]; then
export EESSI_CVMFS_REPO=/cvmfs/software.eessi.io
fi
if [ -z "${EESSI_VERSION}" ]; then
export EESSI_VERSION=2023.06
if [ -z "${USE_EESSI_SOFTWARE_STACK}" ] || [ "$USE_EESSI_SOFTWARE_STACK" == "True" ]; then
export USE_EESSI_SOFTWARE_STACK=True
Collaborator:

It would be good to print some more of these variables in the section that also prints the ReFrame config etc. Makes things easier to debug.

Collaborator Author:

I've now added these two:

echo "Using EESSI: ${USE_EESSI_SOFTWARE_STACK}"
echo "MODULEPATH: ${MODULEPATH}"

if [ -z "${EESSI_CVMFS_REPO}" ]; then
export EESSI_CVMFS_REPO=/cvmfs/software.eessi.io
fi
if [ -z "${EESSI_VERSION}" ]; then
export EESSI_VERSION=2023.06
fi
fi
if [ -z "${RFM_CONFIG_FILES}" ]; then
export RFM_CONFIG_FILES="${TEMPDIR}/test-suite/config/${EESSI_CI_SYSTEM_NAME}.py"
@@ -73,6 +78,9 @@ if [ -z "${REFRAME_TIMEOUT}" ]; then
# This will prevent multiple ReFrame runs from piling up and exceeding the quota on our Magic Castle clusters
export REFRAME_TIMEOUT=1430m
fi
if [ -z "${UNSET_MODULEPATH}" ]; then
export UNSET_MODULEPATH=True
fi

# Create virtualenv for ReFrame using system python
python3 -m venv "${TEMPDIR}"/reframe_venv
@@ -92,10 +100,25 @@ echo "Cloning EESSI repo: git clone ${EESSI_CLONE_ARGS}"
git clone ${EESSI_CLONE_ARGS}
export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/test-suite/

# Unset MODULEPATH on systems where this is required
if [ "$UNSET_MODULEPATH" == "True" ]; then
unset MODULEPATH
fi

# Set local module environment
if [ "$SET_LOCAL_MODULE_ENV" == "True" ]; then
if [ -z "${LOCAL_MODULES}" ]; then
echo "You have to add the name of the module in the ci_config.sh file of your system"
exit 1
fi
module load "${LOCAL_MODULES}"
fi

# Start the EESSI environment
unset MODULEPATH
eessi_init_path="${EESSI_CVMFS_REPO}"/versions/"${EESSI_VERSION}"/init/bash
source "${eessi_init_path}"
if [ "$USE_EESSI_SOFTWARE_STACK" == "True" ]; then
eessi_init_path="${EESSI_CVMFS_REPO}"/versions/"${EESSI_VERSION}"/init/bash
source "${eessi_init_path}"
fi

Collaborator:

Should we have an alternative to load a local module environment? Might not be needed on your systems. On our system we'd have to load a meta-module (e.g. 2023, or 2024) to make a software environment available.

Collaborator Author:

On our system we have sticky module environments per partition/cluster. Right now I do that in the run_reframe_wrapper.sh wrapper, but I can look into moving that to hortense_local_ss/ci_config.sh.

Collaborator Author:

What could work is that we add something like LOCAL_CI_MODULE_ENV=2024, and in our case LOCAL_CI_MODULE_ENV=cpu_rome_256gb. I'm hardcoding that for now because the fix in ReFrame is not part of a release yet. I'll add that now, and then you can see if it works.

# Needed in order to make sure the reframe from our TEMPDIR is first on the PATH,
# prior to the one shipped with the 2021.12 compat layer
@@ -119,6 +142,8 @@ echo "ReFrame check search path: ${RFM_CHECK_SEARCH_PATH}"
echo "ReFrame check search recursive: ${RFM_CHECK_SEARCH_RECURSIVE}"
echo "ReFrame prefix: ${RFM_PREFIX}"
echo "ReFrame args: ${REFRAME_ARGS}"
echo "Using EESSI: ${USE_EESSI_SOFTWARE_STACK}"
echo "MODULEPATH: ${MODULEPATH}"
echo ""

# List tests
41 changes: 26 additions & 15 deletions config/vsc_hortense.py
@@ -2,6 +2,7 @@
# https://docs.vscentrum.be/en/latest/gent/tier1_hortense.html
#
# authors: Samuel Moors (VUB-HPC), Kenneth Hoste (HPC-UGent)
import os

from reframe.core.backends import register_launcher
from reframe.core.launchers import JobLauncher
Expand All @@ -21,6 +22,16 @@ def command(self, job):
return ['mympirun', '--hybrid', str(job.num_tasks_per_node)]


eessi_cvmfs_repo = os.getenv('EESSI_CVMFS_REPO', None)
if eessi_cvmfs_repo is not None:
prepare_eessi_init = 'module --force purge'
launcher = 'mpirun'
mpi_module = ''
else:
prepare_eessi_init = ''
launcher = 'mympirun'
mpi_module = 'vsc-mympirun'

site_configuration = {
'systems': [
{
@@ -32,13 +43,13 @@ def command(self, job):
{
'name': 'cpu_rome_256gb',
'scheduler': 'slurm',
'prepare_cmds': [common_eessi_init()],
'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
'access': hortense_access + ['--partition=cpu_rome'],
'environs': ['default'],
'descr': 'CPU nodes (AMD Rome, 256GiB RAM)',
'max_jobs': 20,
'launcher': 'mympirun',
'modules': ['vsc-mympirun'],
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 128,
'num_sockets': 2,
@@ -64,13 +75,13 @@ def command(self, job):
{
'name': 'cpu_rome_512gb',
'scheduler': 'slurm',
'prepare_cmds': [common_eessi_init()],
'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
'access': hortense_access + ['--partition=cpu_rome_512'],
'environs': ['default'],
'descr': 'CPU nodes (AMD Rome, 512GiB RAM)',
'max_jobs': 20,
'launcher': 'mympirun',
'modules': ['vsc-mympirun'],
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 128,
'num_sockets': 2,
@@ -96,13 +107,13 @@ def command(self, job):
{
'name': 'cpu_milan',
'scheduler': 'slurm',
'prepare_cmds': [common_eessi_init()],
'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
'access': hortense_access + ['--partition=cpu_milan'],
'environs': ['default'],
'descr': 'CPU nodes (AMD Milan, 256GiB RAM)',
'max_jobs': 20,
'launcher': 'mympirun',
'modules': ['vsc-mympirun'],
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 128,
'num_sockets': 2,
@@ -128,13 +139,13 @@ def command(self, job):
{
'name': 'gpu_rome_a100_40gb',
'scheduler': 'slurm',
'prepare_cmds': [common_eessi_init()],
'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
'access': hortense_access + ['--partition=gpu_rome_a100_40'],
'environs': ['default'],
'descr': 'GPU nodes (A100 40GB)',
'max_jobs': 20,
'launcher': 'mympirun',
'modules': ['vsc-mympirun'],
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 48,
'num_sockets': 2,
@@ -172,13 +183,13 @@ def command(self, job):
{
'name': 'gpu_rome_a100_80gb',
'scheduler': 'slurm',
'prepare_cmds': [common_eessi_init()],
'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
'access': hortense_access + ['--partition=gpu_rome_a100_80'],
'environs': ['default'],
'descr': 'GPU nodes (A100 80GB)',
'max_jobs': 20,
'launcher': 'mympirun',
'modules': ['vsc-mympirun'],
'launcher': launcher,
'modules': [mpi_module],
'processor': {
'num_cpus': 48,
'num_sockets': 2,
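
The launcher and module selection in this config is driven entirely by the environment in which ReFrame is started; a sketch of the two modes, assuming the check search path and virtualenv are set up as in run_reframe.sh:

# EESSI stack: EESSI_CVMFS_REPO is set, so the config purges modules and uses mpirun
export EESSI_CVMFS_REPO=/cvmfs/software.eessi.io
reframe -C config/vsc_hortense.py --system hortense:cpu_rome_256gb -l

# local stack: leave EESSI_CVMFS_REPO unset, so mympirun and the vsc-mympirun module are used
unset EESSI_CVMFS_REPO
reframe -C config/vsc_hortense.py --system hortense:cpu_rome_256gb -l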