diff --git a/.github/workflows/pip_install.yml b/.github/workflows/pip_install.yml index 84ad340a..1847574d 100644 --- a/.github/workflows/pip_install.yml +++ b/.github/workflows/pip_install.yml @@ -4,20 +4,33 @@ on: [push, pull_request, workflow_dispatch] permissions: read-all jobs: test_pip_install: - runs-on: ubuntu-22.04 + # ubuntu <= 20.04 is required for python 3.6 + runs-on: ubuntu-20.04 strategy: fail-fast: false matrix: - python: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] steps: - name: Check out software-layer repository uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 with: persist-credentials: false + - name: Set up Python + uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install setuptools + run: | + if [[ "${{ matrix.python-version }}" == "3.6" ]]; then + # system installed setuptools version in RHEL8 and CO7 + python -m pip install --user setuptools==39.2.0 + fi + - name: Install ReFrame run: | - pip install --user ReFrame-HPC + python -m pip install --user ReFrame-HPC - name: Install EESSI test suite with 'pip install' run: | @@ -26,8 +39,15 @@ jobs: python setup.py sdist ls dist - pip install --user dist/eessi*.tar.gz + python -m pip install --user dist/eessi*.tar.gz find $HOME/.local + # make sure we are not in the source directory + cd $HOME + + python --version + python -m pip --version + python -c 'import setuptools; print("setuptools", setuptools.__version__)' + python -c 'import eessi.testsuite.utils' python -c 'import eessi.testsuite.tests.apps' diff --git a/README.md b/README.md index b44e084d..cdec5fd4 100644 --- a/README.md +++ b/README.md @@ -1,108 +1,13 @@ # test-suite -A portable test suite for software installations, using ReFrame -## Getting started +A portable test suite for software installations, using ReFrame. 
-- install ReFrame >=4.0 +## Documentation -- install the test suite using +For documentation on installing, configuring, and using the EESSI test suite, see https://eessi.io/docs/test-suite/. -```bash -pip install git+https://github.com/EESSI/test-suite.git -``` - -Alternatively, you can clone the repository - -```bash -git clone git@github.com:EESSI/test-suite.git -``` - -- add the path of the `test-suite` directory to your ``$PYTHONPATH`` - -- create a site configuration file - - - should look similar to `test-suite/config/settings_example.py` - -- run the tests - - The example below runs a gromacs simulation using GROMACS modules available - in the system, in combination with all available system:partitions as - defined in the site config file, using 1 full node (`--tag 1_node`, see `SCALES` - in `constants.py`). This example assumes that you have cloned the - repository at `/path/to/EESSI/test-suite`. - -``` -cd /path/to/EESSI/test-suite - -module load ReFrame/4.2.0 - -export PYTHONPATH=$PWD:$PYTHONPATH - -reframe \ - --config-file \ - --checkpath eessi/testsuite/tests/apps \ - --tag CI --tag 1_node \ - --run --performance-report -``` +## Development -## Configuring GPU/non-GPU partitions in your site config file: - -- running GPU jobs in GPU nodes - - add `'features': [FEATURES[GPU]]` to the GPU partitions - - add `'extras': {GPU_VENDOR: GPU_VENDORS[NVIDIA]}` to the GPU partitions (or - `INTEL` or `AMD`, see `GPU_VENDORS` in `constants.py`) - -- running non-GPU jobs in non-GPU nodes - - add `'features': [FEATURES[CPU]]` to the non-GPU partitions - -- running both GPU jobs and non-GPU jobs in GPU nodes - - add `'features': [FEATURES[CPU], FEATURES[GPU]]` to the GPU partitions - -- setting the number of GPUS per node for a partition: - ``` - 'access': ['-p '], - 'devices': [ - {'type': DEVICE_TYPES[GPU], 'num_devices': } - ], - ``` -- requesting GPUs per node for a partition: - ``` - 'resources': [ - { - 'name': '_rfm_gpu', - 'options': 
['--gpus-per-node={num_gpus_per_node}'], - } - ], - ``` - -## Changing the default test behavior on the cmd line - -- specifying modules - - `--setvar modules=` - -- specifying valid systems:partitions - - `--setvar valid_systems=` - - Note that setting `valid_systems` on the cmd line disables filtering of - valid systems:partitions in the hooks, so you have to do the filtering - yourself. - -- overriding tasks, cpus, gpus - - `--setvar num_tasks_per_node=` - - `--setvar num_cpus_per_task=` - - `--setvar num_gpus_per_node=` - -- setting additional environment variables - - `--setvar env_vars=:` - -Note that these override the variables for _all_ tests in the test suite that -respect those variables. To override a variable only for specific tests, one -can use the `TEST.VAR` syntax. For example, to run the `GROMACS_EESSI` test with the -module `GROMACS/2021.6-foss-2022a`: - -- `--setvar GROMACS_EESSI.modules=GROMACS/2021.6-foss-2022a` - -## Developers If you want to install the EESSI test suite from a branch, you can either install the feature branch with `pip`, or clone the Github repository and check out the feature branch. @@ -123,8 +28,9 @@ pip install git+https://github.com//test-suite.git@branchname ``` ### Check out a feature branch from a fork -We'll assume you already have a local clone of the official test-suite -repository, called 'origin'. In that case, executing `git remote -v`, you + +We'll assume you already have a local clone of the official `test-suite` +repository, called '`origin`'. 
In that case, executing `git remote -v`, you should see: ```bash @@ -146,10 +52,10 @@ With `git remote -v` you should now see the new remote: ```bash $ git remote -v -origin git@github.com:EESSI/test-suite.git (fetch) -origin git@github.com:EESSI/test-suite.git (push) -casparvl git@github.com:casparvl/test-suite.git (fetch) -casparvl git@github.com:casparvl/test-suite.git (push) +origin git@github.com:EESSI/test-suite.git (fetch) +origin git@github.com:EESSI/test-suite.git (push) +casparvl git@github.com:casparvl/test-suite.git (fetch) +casparvl git@github.com:casparvl/test-suite.git (push) ``` Next, we'll fetch the branches that `casparvl` has in his fork: @@ -161,10 +67,8 @@ $ git fetch casparvl We can check the remote branches using ```bash $ git branch --list --remotes - casparvl/gromacs_cscs + casparvl/example_branch casparvl/main - casparvl/setuppy - casparvl/updated_defaults_pr11 origin/HEAD -> origin/main origin/main ``` @@ -173,14 +77,15 @@ $ git branch --list --remotes this command). Finally, we can create a new local branch (`-c`) and checkout one of these -feature branches (e.g. `setuppy` from the remote `casparvl`). Here, we've -picked `local_setuppy_branch` as the local branch name: +feature branches (e.g. `example_branch` from the remote `casparvl`). Here, we've +picked `my_own_example_branch` as the local branch name: ```bash -$ git switch -c local_setuppy_branch casparvl/setuppy +$ git switch -c my_own_example_branch casparvl/example_branch ``` While the initial setup is a bit more involved, the advantage of this approach is that it is easy to pull in updates from a feature branch using `git pull`. + You can also push back changes to the feature branch directly, but note that you are pushing to the Github fork of another Github user, so _make sure they are ok with that_ before doing so! 
diff --git a/RELEASE_NOTES b/RELEASE_NOTES new file mode 100644 index 00000000..0bb97deb --- /dev/null +++ b/RELEASE_NOTES @@ -0,0 +1,19 @@ +This file contains a description of the major changes to the EESSI test suite. +For more detailed information, please see the git log. + +v0.1.0 (5 October 2023) +----------------------- + +This is the first release of the EESSI test suite. + +It includes: + +* A well-structured `eessi.testsuite` Python package that provides constants, utilities, hooks, and tests, which can be installed with "`pip install`". +* Tests for GROMACS and TensorFlow in `eessi.testsuite.tests.apps` that leverage the functionality provided by `eessi.testsuite.*`. +* Examples of ReFrame configuration files for various systems in the `config` subdirectory. +* A `common_logging_config()` function to facilitate the ReFrame logging configuration. +* A set of standard device types and features that can be used in the partitions section of the ReFrame configuration file. +* A set of tags (CI + scale) that can be used to filter checks. +* Scripts that show how to run the test suite. + +For documentation, see https://eessi.io/docs/test-suite . diff --git a/config/aws_citc.py b/config/aws_citc.py index 181cc11c..6bd44062 100644 --- a/config/aws_citc.py +++ b/config/aws_citc.py @@ -1,29 +1,23 @@ -# This is an example configuration file +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository -# Note that CPU autodetect currently does not work with this configuration file on AWS. -# This is because there is no system mpirun, and the CPU autodetection doesn't load any modules -# that would make an mpirun command available (as normal multiprocessing tests would). 
-# In order to do CPU autodetection, you'll need to change the launcer to srun: -# 'launcher = srun' -# You can run the CPU autodetect by listing all tests (reframe -l ...) -# and then, once all CPUs are autodetected, change the launcher back to mpirun for a 'real' run (reframe -r ...) +# Without this, the autodetect job fails because +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job -# Another known issue is that CPU autodetection fails if run from an actual installation of ReFrame. -# It only works if run from a clone of their Github Repo. See https://github.com/reframe-hpc/reframe/issues/2914 +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. https://github.com/reframe-hpc/reframe/issues/2914 -from os import environ, makedirs +import os +from eessi.testsuite.common_config import common_logging_config from eessi.testsuite.constants import FEATURES -# Get username of current user -homedir = environ.get('HOME') - # This config will write all staging, output and logging to subdirs under this prefix -reframe_prefix = f'{homedir}/reframe_runs' -log_prefix = f'{reframe_prefix}/logs' - -# ReFrame complains if the directory for the file logger doesn't exist yet -makedirs(f'{log_prefix}', exist_ok=True) +# Override with RFM_PREFIX environment variable +reframe_prefix = os.path.join(os.environ['HOME'], 'reframe_runs') # AWS CITC site configuration site_configuration = { @@ -32,7 +26,7 @@ 'name': 'citc', 'descr': 'Cluster in the Cloud build and test environment on AWS', 'modules_system': 'lmod', - 'hostnames': ['mgmt', 'login', 'fair-mastodon*'], + 'hostnames': ['mgmt', 'login', 'fair-mastodon*'], 'prefix': reframe_prefix, 'partitions': [ { @@ -110,9 +104,9 @@ 'access': ['--constraint=shape=c7g.4xlarge', '--export=NONE'], 'descr': 'Graviton3, 16 cores, 32 GiB', }, - ] - }, - ], + ] + }, + ], 'environments': [ { 'name': 'default', @@ -120,48 +114,12 @@ 'cxx': '', 
'ftn': '', }, - ], - 'logging': [ - { - 'level': 'debug', - 'handlers': [ - { - 'type': 'stream', - 'name': 'stdout', - 'level': 'info', - 'format': '%(message)s' - }, - { - 'type': 'file', - 'prefix': f'{log_prefix}/reframe.log', - 'name': 'reframe.log', - 'level': 'debug', - 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 - 'append': True, - 'timestamp': "%Y%m%d_%H%M%S", - }, - ], - 'handlers_perflog': [ - { - 'type': 'filelog', - 'prefix': f'{log_prefix}/%(check_system)s/%(check_partition)s', - 'level': 'info', - 'format': ( - '%(check_job_completion_time)s|reframe %(version)s|' - '%(check_info)s|jobid=%(check_jobid)s|' - '%(check_perf_var)s=%(check_perf_value)s|' - 'ref=%(check_perf_ref)s ' - '(l=%(check_perf_lower_thres)s, ' - 'u=%(check_perf_upper_thres)s)|' - '%(check_perf_unit)s' - ), - 'append': True - } - ] - } ], + 'logging': common_logging_config(reframe_prefix), 'general': [ { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information 'remote_detect': True, } ], @@ -170,13 +128,7 @@ # Add default things to each partition: partition_defaults = { 'scheduler': 'squeue', - # mpirun causes problems with cpu autodetect, since there is no system mpirun. - # See https://github.com/EESSI/test-suite/pull/53#issuecomment-1590849226 - # and this feature request https://github.com/reframe-hpc/reframe/issues/2926 - # However, using srun requires either using pmix or proper pmi2 integration in the MPI library - # See https://github.com/EESSI/test-suite/pull/53#issuecomment-1598753968 - # Thus, we use mpirun for now, and manually swap to srun if we want to autodetect CPUs... 
- 'launcher': 'srun', + 'launcher': 'mpirun', 'environs': ['default'], 'features': [ FEATURES['CPU'] @@ -191,5 +143,3 @@ for system in site_configuration['systems']: for partition in system['partitions']: partition.update(partition_defaults) - - diff --git a/config/github_actions.py b/config/github_actions.py index bea90f1a..68c55072 100644 --- a/config/github_actions.py +++ b/config/github_actions.py @@ -1,6 +1,7 @@ # ReFrame configuration file that can be used in GitHub Actions with EESSI -from eessi.testsuite.constants import * +from eessi.testsuite.common_config import common_logging_config +from eessi.testsuite.constants import * # noqa: F403 site_configuration = { @@ -35,43 +36,7 @@ { 'purge_environment': True, 'resolve_module_conflicts': False, # avoid loading the module before submitting the job - 'keep_stage_files': True, - } - ], - 'logging': [ - { - 'level': 'debug', - 'handlers': [ - { - 'type': 'stream', - 'name': 'stdout', - 'level': 'info', - 'format': '%(message)s' - }, - { - 'type': 'file', - 'level': 'debug', - 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 - 'append': True - } - ], - 'handlers_perflog': [ - { - 'type': 'filelog', - 'prefix': '%(check_system)s/%(check_partition)s', - 'level': 'info', - 'format': ( - '%(check_job_completion_time)s|reframe %(version)s|' - '%(check_info)s|jobid=%(check_jobid)s|' - '%(check_perf_var)s=%(check_perf_value)s|' - 'ref=%(check_perf_ref)s ' - '(l=%(check_perf_lower_thres)s, ' - 'u=%(check_perf_upper_thres)s)|' - '%(check_perf_unit)s' - ), - 'append': True - } - ] - } - ], + } + ], + 'logging': common_logging_config(), } diff --git a/config/izum_vega.py b/config/izum_vega.py index 223b4dfb..ca3e2179 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -1,32 +1,35 @@ -from os import environ, makedirs +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. 
Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# 3. Temporarily change the 'access' field for the GPU partition to +# 'access': ['-p gpu', '--export=None', '--gres=gpu:1'], -from eessi.testsuite.constants import * +# Without this, the autodetect job fails because +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job +# 3. Vega doesn't allow submission to the GPU partition without requesting at least one GPU (change #3) -# Get username of current user -homedir = environ.get('HOME') +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. https://github.com/reframe-hpc/reframe/issues/2914 -# This config will write all staging, output and logging to subdirs under this prefix -reframe_prefix = f'{homedir}/reframe_runs' -log_prefix = f'{reframe_prefix}/logs' +import os + +from eessi.testsuite.common_config import common_logging_config +from eessi.testsuite.constants import * # noqa: F403 -# ReFrame complains if the directory for the file logger doesn't exist yet -makedirs(f'{log_prefix}', exist_ok=True) +# This config will write all staging, output and logging to subdirs under this prefix +# Override with RFM_PREFIX environment variable +reframe_prefix = os.path.join(os.environ['HOME'], 'reframe_runs') # This is an example configuration file site_configuration = { - 'general': [ - { - # Enable automatic detection of CPU architecture for each partition - # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information - 'remote_detect': True, - } - ], 'systems': [ { 'name': 'vega', 'descr': 'Vega, a EuroHPC JU system', 'modules_system': 'lmod', - 'hostnames': ['vglogin*','cn*','gn*'], + 'hostnames': ['vglogin*', 'cn*', 'gn*'], 'prefix': reframe_prefix, 'partitions': [ { @@ -42,7 +45,7 @@ # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 
'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection + 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p cpu', '--export=None'], 'environs': ['default'], @@ -65,7 +68,7 @@ # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection + 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p gpu', '--export=None'], 'environs': ['default'], @@ -87,9 +90,9 @@ ], 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' }, - ] - }, - ], + ] + }, + ], 'environments': [ { 'name': 'default', @@ -97,46 +100,13 @@ 'cxx': '', 'ftn': '', }, - ], - 'logging': [ + ], + 'logging': common_logging_config(reframe_prefix), + 'general': [ { - 'level': 'debug', - 'handlers': [ - { - 'type': 'stream', - 'name': 'stdout', - 'level': 'info', - 'format': '%(message)s' - }, - { - 'type': 'file', - 'name': f'{log_prefix}/reframe.log', - 'level': 'debug', - 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 - 'append': True, - 'timestamp': "%Y%m%d_%H%M%S", - } - ], - 'handlers_perflog': [ - { - 'type': 'filelog', - 'prefix': f'{log_prefix}/%(check_system)s/%(check_partition)s', - 'level': 'info', - 'format': ( - '%(check_job_completion_time)s|reframe %(version)s|' - '%(check_info)s|jobid=%(check_jobid)s|' - '%(check_perfvalues)s' - ), - 'format_perfvars': ( - '%(check_perf_var)s=%(check_perf_value)s|' - 'ref=%(check_perf_ref)s ' - '(l=%(check_perf_lower_thres)s, ' - 'u=%(check_perf_upper_thres)s)|' - '%(check_perf_unit)s|' - ), - 'append': True - } - ] + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 
'remote_detect': True, } ], } diff --git a/config/settings_example.py b/config/settings_example.py index 5954974b..c46636a5 100644 --- a/config/settings_example.py +++ b/config/settings_example.py @@ -1,12 +1,27 @@ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# If your system has a GPU partition, it might force jobs to request at least one GPU. If that is the +# case, you also need to temporarily change 'access' field for the GPU partition to include the request +# for one GPU, e.g. 'access': ['-p gpu', '--export=None', '--gres=gpu:1'], + +# Without this, the autodetect job fails because +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job + +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. 
https://github.com/reframe-hpc/reframe/issues/2914 + + """ Example configuration file """ -from os import environ - -from eessi.testsuite.constants import * +import os +from eessi.testsuite.common_config import common_logging_config, format_perfvars, perflog_format +from eessi.testsuite.constants import * # noqa: F403 -username = environ.get('USER') site_configuration = { 'systems': [ @@ -16,37 +31,44 @@ 'modules_system': 'lmod', 'hostnames': ['*'], # Note that the stagedir should be a shared directory available on all nodes running ReFrame tests - 'stagedir': f'/some/shared/dir/{username}/reframe_output/staging', + 'stagedir': f'/some/shared/dir/{os.environ.get("USER")}/reframe_output/staging', 'partitions': [ { 'name': 'cpu_partition', + 'descr': 'CPU partition', 'scheduler': 'slurm', 'launcher': 'mpirun', - 'access': ['-p cpu'], + 'access': ['-p cpu', '--export=None'], + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'environs': ['default'], 'max_jobs': 4, - 'processor': { - 'num_cpus': 128, - 'num_sockets': 2, - 'num_cpus_per_socket': 64, - 'arch': 'zen2', - }, + # We recommend to rely on ReFrame's CPU autodetection, + # and only define the 'processor' field if autodetection fails + # 'processor': { + # 'num_cpus': 128, + # 'num_sockets': 2, + # 'num_cpus_per_socket': 64, + # 'num_cpus_per_core': 1, + # }, 'features': [FEATURES[CPU]], - 'descr': 'CPU partition' }, { 'name': 'gpu_partition', + 'descr': 'GPU partition', 'scheduler': 'slurm', 'launcher': 'mpirun', - 'access': ['-p gpu'], + 'access': ['-p gpu', '--export=None'], + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'environs': ['default'], 'max_jobs': 4, - 'processor': { - 'num_cpus': 72, - 'num_sockets': 2, - 'num_cpus_per_socket': 36, - 'arch': 'icelake', - }, + # We recommend to rely on ReFrame's CPU autodetection, + # and only define the 'processor' field if autodetection fails + # 'processor': { + # 'num_cpus': 72, + # 'num_sockets': 2, + # 
'num_cpus_per_socket': 36, + # 'num_cpus_per_core': 1, + # }, 'resources': [ { 'name': '_rfm_gpu', @@ -66,7 +88,6 @@ 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], }, - 'descr': 'GPU partition' }, ] }, @@ -79,42 +100,22 @@ 'ftn': '', }, ], - 'logging': [ + 'logging': common_logging_config(), + 'general': [ { - 'level': 'debug', - 'handlers': [ - { - 'type': 'stream', - 'name': 'stdout', - 'level': 'info', - 'format': '%(message)s' - }, - { - 'type': 'file', - 'name': 'reframe.log', - 'level': 'debug', - 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 - 'append': True, - 'timestamp': "%Y%m%d_%H%M%S", - } - ], - 'handlers_perflog': [ - { - 'type': 'filelog', - 'prefix': '%(check_system)s/%(check_partition)s', - 'level': 'info', - 'format': ( - '%(check_job_completion_time)s|reframe %(version)s|' - '%(check_info)s|jobid=%(check_jobid)s|' - '%(check_perf_var)s=%(check_perf_value)s|' - 'ref=%(check_perf_ref)s ' - '(l=%(check_perf_lower_thres)s, ' - 'u=%(check_perf_upper_thres)s)|' - '%(check_perf_unit)s' - ), - 'append': True - } - ] + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 'remote_detect': True, } ], } + +# optional logging to syslog +site_configuration['logging'][0]['handlers_perflog'].append({ + 'type': 'syslog', + 'address': '/dev/log', + 'level': 'info', + 'format': f'reframe: {perflog_format}', + 'format_perfvars': format_perfvars, + 'append': True, +}) diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 532ecb2f..d15d2a6d 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -1,9 +1,26 @@ -from os import environ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository +# 3. 
Temporarily change the 'access' field for the GPU partition to +# 'access': ['-p gpu', '--export=None', '--exclusive'], -from eessi.testsuite.constants import * +# Without this, the autodetect job fails because +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job +# 3. Snellius doesn't allow submission to the GPU partition without requesting at least one GPU +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. https://github.com/reframe-hpc/reframe/issues/2914 -username = environ.get('USER') +import os + +from eessi.testsuite.common_config import common_logging_config +from eessi.testsuite.constants import * # noqa: F403 + +# This config will write all staging, output and logging to subdirs under this prefix +# Override with RFM_PREFIX environment variable +reframe_prefix = os.path.join(os.environ['HOME'], 'reframe_runs') # This is an example configuration file site_configuration = { @@ -13,21 +30,36 @@ 'descr': 'Dutch National Supercomputer', 'modules_system': 'lmod', 'hostnames': ['int*', 'tcn*', 'hcn*', 'fcn*', 'gcn*', 'srv*'], - 'stagedir': f'/scratch-shared/{username}/reframe_output/staging', + 'prefix': reframe_prefix, + 'stagedir': f'/scratch-shared/{os.environ.get("USER")}/reframe_output/staging', 'partitions': [ { - 'name': 'thin', + 'name': 'rome', 'scheduler': 'slurm', 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'launcher': 'mpirun', - 'access': ['-p thin', '--export=None'], + 'access': ['-p rome', '--export=None'], 'environs': ['default'], 'max_jobs': 120, 'features': [ FEATURES[CPU], ], - 'descr': 'Test CPU partition with native EESSI stack' + 'descr': 'AMD Rome CPU partition with native EESSI stack' }, + { + 'name': 'genoa', + 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'launcher': 'mpirun', + 'access': ['-p genoa', '--export=None'], + 'environs': ['default'], + 
'max_jobs': 120, + 'features': [ + FEATURES[CPU], + ], + 'descr': 'AMD Genoa CPU partition with native EESSI stack' + }, + { 'name': 'gpu', 'scheduler': 'slurm', @@ -54,7 +86,7 @@ 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], }, - 'descr': 'Test GPU partition with native EESSI stack' + 'descr': 'Nvidia A100 GPU partition with native EESSI stack' }, ] }, @@ -67,49 +99,11 @@ 'ftn': '', }, ], - 'logging': [ - { - 'level': 'debug', - 'handlers': [ - { - 'type': 'stream', - 'name': 'stdout', - 'level': 'info', - 'format': '%(message)s' - }, - { - 'type': 'file', - 'name': 'reframe.log', - 'level': 'debug', - 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 - 'append': True, - 'timestamp': "%Y%m%d_%H%M%S", - } - ], - 'handlers_perflog': [ - { - 'type': 'filelog', - 'prefix': '%(check_system)s/%(check_partition)s', - 'level': 'info', - 'format': ( - '%(check_job_completion_time)s|reframe %(version)s|' - '%(check_info)s|jobid=%(check_jobid)s|' - '%(check_perf_var)s=%(check_perf_value)s|' - 'ref=%(check_perf_ref)s ' - '(l=%(check_perf_lower_thres)s, ' - 'u=%(check_perf_upper_thres)s)|' - '%(check_perf_unit)s' - ), - 'append': True - } - ] - } - ], + 'logging': common_logging_config(reframe_prefix), 'general': [ { - # For autodetect to work, temporarily change: - # 1. The launchers to srun - # 2. 
Add --exclusive to GPU 'access' field above (avoids submission error that no GPUs are requested) + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information 'remote_detect': True, } ], diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py index 223783cb..14d94c36 100644 --- a/config/vsc_hortense.py +++ b/config/vsc_hortense.py @@ -1,47 +1,23 @@ # ReFrame configuration file for VSC Tier-1 Hortense # https://docs.vscentrum.be/en/latest/gent/tier1_hortense.html # -# authors: Sam Moors (VUB-HPC), Kenneth Hoste (HPC-UGent) +# authors: Samuel Moors (VUB-HPC), Kenneth Hoste (HPC-UGent) from reframe.core.backends import register_launcher from reframe.core.launchers import JobLauncher -from eessi.testsuite.constants import * - +from eessi.testsuite.common_config import common_logging_config +from eessi.testsuite.constants import * # noqa: F403 account = "my-slurm-account" -# use 'info' to log to syslog -syslog_level = 'warning' - -perf_logging_format = 'reframe: ' + '|'.join([ - 'username=%(osuser)s', - 'version=%(version)s', - 'name=%(check_name)s', - 'system=%(check_system)s', - 'partition=%(check_partition)s', - 'environ=%(check_environ)s', - 'num_tasks=%(check_num_tasks)s', - 'num_cpus_per_task=%(check_num_cpus_per_task)s', - 'num_tasks_per_node=%(check_num_tasks_per_node)s', - 'modules=%(check_modules)s', - 'jobid=%(check_jobid)s', - '%(check_perfvalues)s', -]) - -format_perfvars = '|'.join([ - 'perf_var=%(check_perf_var)s', - 'perf_value=%(check_perf_value)s', - 'unit=%(check_perf_unit)s', -]) + '|' - hortense_access = [f'-A {account}', '--export=NONE', '--get-user-env=60L'] @register_launcher('mympirun') class MyMpirunLauncher(JobLauncher): def command(self, job): - return ['mympirun', '--hybrid', str(job.num_tasks)] + return ['mympirun', '--hybrid', str(job.num_tasks_per_node)] site_configuration = { @@ -55,6 +31,7 @@ def command(self, job): { 
'name': 'cpu_rome_256gb', 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'access': hortense_access + ['--partition=cpu_rome'], 'environs': ['default'], 'descr': 'CPU nodes (AMD Rome, 256GiB RAM)', @@ -65,6 +42,7 @@ def command(self, job): 'num_cpus': 128, 'num_sockets': 2, 'num_cpus_per_socket': 64, + 'num_cpus_per_core': 1, 'arch': 'zen2', }, 'features': [ @@ -74,6 +52,7 @@ def command(self, job): { 'name': 'cpu_rome_512gb', 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'access': hortense_access + ['--partition=cpu_rome_512'], 'environs': ['default'], 'descr': 'CPU nodes (AMD Rome, 512GiB RAM)', @@ -84,6 +63,7 @@ def command(self, job): 'num_cpus': 128, 'num_sockets': 2, 'num_cpus_per_socket': 64, + 'num_cpus_per_core': 1, 'arch': 'zen2', }, 'features': [ @@ -93,6 +73,7 @@ def command(self, job): { 'name': 'cpu_milan', 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'access': hortense_access + ['--partition=cpu_milan'], 'environs': ['default'], 'descr': 'CPU nodes (AMD Milan, 256GiB RAM)', @@ -103,6 +84,7 @@ def command(self, job): 'num_cpus': 128, 'num_sockets': 2, 'num_cpus_per_socket': 64, + 'num_cpus_per_core': 1, 'arch': 'zen3', }, 'features': [ @@ -112,6 +94,7 @@ def command(self, job): { 'name': 'gpu_rome_a100_40gb', 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'access': hortense_access + ['--partition=cpu_rome_a100_40'], 'environs': ['default'], 'descr': 'GPU nodes (A100 40GB)', @@ -122,6 +105,7 @@ def command(self, job): 'num_cpus': 48, 'num_sockets': 2, 'num_cpus_per_socket': 24, + 'num_cpus_per_core': 1, 'arch': 'zen2', }, 'features': [ @@ -147,6 +131,7 @@ def command(self, job): { 'name': 'gpu_rome_a100_80gb', 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'access': hortense_access + 
['--partition=cpu_rome_a100_80'], 'environs': ['default'], 'descr': 'GPU nodes (A100 80GB)', @@ -157,6 +142,7 @@ def command(self, job): 'num_cpus': 48, 'num_sockets': 2, 'num_cpus_per_socket': 24, + 'num_cpus_per_core': 1, 'arch': 'zen2', }, 'features': [ @@ -214,54 +200,7 @@ def command(self, job): { 'purge_environment': True, 'resolve_module_conflicts': False, # avoid loading the module before submitting the job - 'keep_stage_files': True, - } - ], - 'logging': [ - { - 'level': 'debug', - 'handlers': [ - { - 'type': 'file', - 'name': 'reframe.log', - 'level': 'debug', - 'format': '[%(asctime)s] %(levelname)s: %(check_name)s: %(message)s', # noqa: E501 - 'append': True, - 'timestamp': "%Y%m%d_%H%M%S", - }, - { - 'type': 'stream', - 'name': 'stdout', - 'level': 'info', - 'format': '%(message)s', - }, - { - 'type': 'file', - 'name': 'reframe.out', - 'level': 'info', - 'format': '%(message)s', - 'append': True, - 'timestamp': "%Y%m%d_%H%M%S", - }, - ], - 'handlers_perflog': [ - { - 'type': 'filelog', - 'prefix': '%(check_system)s/%(check_partition)s', - 'level': 'info', - 'format': '%(check_job_completion_time)s ' + perf_logging_format, - 'format_perfvars': format_perfvars, - 'append': True, - }, - { - 'type': 'syslog', - 'address': '/dev/log', - 'level': syslog_level, - 'format': perf_logging_format, - 'format_perfvars': format_perfvars, - 'append': True, - }, - ], } ], + 'logging': common_logging_config(), } diff --git a/eessi/__init__.py b/eessi/__init__.py new file mode 100644 index 00000000..5284146e --- /dev/null +++ b/eessi/__init__.py @@ -0,0 +1 @@ +__import__("pkg_resources").declare_namespace(__name__) diff --git a/eessi/testsuite/__init__.py b/eessi/testsuite/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/eessi/testsuite/common_config.py b/eessi/testsuite/common_config.py new file mode 100644 index 00000000..4fef1a2f --- /dev/null +++ b/eessi/testsuite/common_config.py @@ -0,0 +1,70 @@ +import os + +perflog_format = '|'.join([ + 
'%(check_job_completion_time)s', + '%(osuser)s', + '%(version)s', + '%(check_unique_name)s', + '%(check_info)s', + '%(check_system)s', + '%(check_partition)s', + '%(check_environ)s', + '%(check_exclusive_access)s', + '%(check_num_tasks)s', + '%(check_num_cpus_per_task)s', + '%(check_num_tasks_per_node)s', + '%(check_num_gpus_per_node)s', + '%(check_use_multithreading)s', + '%(check_modules)s', + '%(check_jobid)s', + '%(check_perfvalues)s', +]) + +format_perfvars = '|'.join([ + '%(check_perf_var)s', + '%(check_perf_value)s', + '%(check_perf_lower_thres)s', + '%(check_perf_upper_thres)s', + '%(check_perf_unit)s', + '' # final delimiter required +]) + + +def common_logging_config(prefix=None): + """ + return default logging configuration as a list: stdout, file log, perflog + :param prefix: file log prefix + """ + prefix = os.getenv('RFM_PREFIX', prefix if prefix else '.') + logdir = os.path.join(prefix, 'logs') + os.makedirs(logdir, exist_ok=True) + + return [{ + 'level': 'debug', + 'handlers': [ + { + 'type': 'stream', + 'name': 'stdout', + 'level': 'info', + 'format': '%(message)s', + }, + { + 'type': 'file', + 'name': os.path.join(logdir, 'reframe.log'), + 'level': 'debug', + 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', + 'append': True, + 'timestamp': "%Y%m%d_%H%M%S", # add a timestamp to the filename (reframe_.log) + }, + ], + 'handlers_perflog': [ + { + 'type': 'filelog', + 'prefix': '%(check_system)s/%(check_partition)s', + 'level': 'info', + 'format': perflog_format, + 'format_perfvars': format_perfvars, + 'append': True, # avoid overwriting + }, + ], + }] diff --git a/eessi/testsuite/tests/__init__.py b/eessi/testsuite/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/eessi/testsuite/tests/apps/__init__.py b/eessi/testsuite/tests/apps/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/eessi/testsuite/tests/apps/tensorflow/__init__.py b/eessi/testsuite/tests/apps/tensorflow/__init__.py 
new file mode 100644 index 00000000..e69de29b diff --git a/eessi/testsuite/tests/apps/tensorflow/src/__init__.py b/eessi/testsuite/tests/apps/tensorflow/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/eessi/testsuite/utils.py b/eessi/testsuite/utils.py index 3b770d77..79ae1ec4 100644 --- a/eessi/testsuite/utils.py +++ b/eessi/testsuite/utils.py @@ -57,15 +57,60 @@ def is_cuda_required_module(module_name: str) -> bool: return requires_cuda -def find_modules(substr: str) -> Iterator[str]: - """Return all modules in the current system that contain ``substr`` in their name.""" - if not isinstance(substr, str): +def find_modules(regex: str, name_only=True) -> Iterator[str]: + """ + Return all modules matching the regular expression regex. Note that since we use re.search, + a module matches if the regex matches the module name at any place. I.e. the match does + not have to be at the start of the module name + + Arguments: + - regex: a regular expression + - name_only: regular expressions will only be matched on the module name, not the version (default: True). + + Note: the name_only feature assumes anything after the last forward slash '/' is the version, + and strips that before doing a match. 
+ + Example + + Suppose we have the following modules on a system: + + gompic/2022a + gompi/2022a + CGAL/4.14.3-gompi-2022a + + The following calls would return the following respective modules + + find_modules('gompi') => [gompic/2022a, gompi/2022a] + find_modules('gompi$') => [gompi/2022a] + find_modules('gompi', name_only = False) => [gompic/2022a, gompi/2022a, CGAL/4.14.3-gompi-2022a] + find_modules('^gompi', name_only = False) => [gompic/2022a, gompi/2022a] + find_modules('^gompi/', name_only = False) => [gompi/2022a] + find_modules('-gompi-2022a', name_only = False) => [CGAL/4.14.3-gompi-2022a] + + """ + + if not isinstance(regex, str): raise TypeError("'substr' argument must be a string") ms = rt.runtime().modules_system - modules = OrderedSet(ms.available_modules(substr)) - for m in modules: - yield m + # Returns e.g. ['Bison/', 'Bison/3.7.6-GCCcore-10.3.0', 'BLIS/', 'BLIS/0.8.1-GCC-10.3.0'] + modules = ms.available_modules('') + for mod in modules: + # Exclude anything without version, i.e. ending with / (e.g. Bison/) + if re.search('.*/$', mod): + continue + # The thing we yield should always be the original module name (orig_mod), including version + orig_mod = mod + if name_only: + # Remove trailing slashes from the regex (in case the caller forgot) + regex = regex.rstrip('/') + # Remove part after the last forward slash, as we assume this is the version + mod = re.sub('/[^/]*$', '', mod) + # Match the actual regular expression + log(f"Matching module {mod} with regex {regex}") + if re.search(regex, mod): + log(f"Match!") + yield orig_mod def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: """ @@ -104,4 +149,4 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: "This is a programming error, please report this issue." 
) raise AttributeError(msg) - \ No newline at end of file + diff --git a/pyproject.toml b/pyproject.toml index 65cb01e9..3c374a5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "eessi-testsuite" -version = "0.0.2" +version = "0.1.0" description = "Test suite for the EESSI software stack" readme = "README.md" license = {file = "LICENSE"} @@ -14,10 +14,8 @@ classifiers = [ requires-python = ">=3.6" [project.urls] -"Homepage" = "https://eessi.github.io/docs/software_testing/" +"Homepage" = "https://eessi.io/docs/test-suite" "Bug Tracker" = "https://github.com/EESSI/test-suite/issues" [tool.setuptools.packages.find] -where = ["eessi/testsuite"] include = ["eessi*"] -namespaces = true diff --git a/scripts/run_reframe.sh b/scripts/run_reframe.sh index 870d41a5..b4b6f20c 100755 --- a/scripts/run_reframe.sh +++ b/scripts/run_reframe.sh @@ -44,7 +44,15 @@ source ${TEMPDIR}/${REFRAME_VENV}/bin/activate # Run ReFrame echo "PYTHONPATH: ${PYTHONPATH}" -reframe -C ${TEMPDIR}/test-suite/config/${RFM_CONFIG_NAME} -c ${TEMPDIR}/test-suite/eessi/testsuite/tests/apps/ -R -t CI ${TAGS} -r --performance-report +options=( + --config-file ${TEMPDIR}/test-suite/config/${RFM_CONFIG_NAME} + --checkpath ${TEMPDIR}/test-suite/eessi/testsuite/tests/apps/ + --recursive # Search for checks in the search path recursively + --tag CI ${TAGS} + --run + --performance-report +) +reframe "${options[@]}" # Cleanup rm -rf ${TEMPDIR} diff --git a/setup.cfg b/setup.cfg index b6329a2f..9839603a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,12 +1,14 @@ [metadata] name = eessi-testsuite -version = 0.0.2 +version = 0.1.0 description = Test suite for the EESSI software stack -long_description = file: README.md, LICENSE +long_description = file: README.md +long_description_content_type = text/markdown +license = GPL-2.0-only classifiers = Programming Language :: Python :: 3 project_urls = - Homepage = 
https://eessi.github.io/docs/software_testing/ + Homepage = https://eessi.io/docs/test-suite Bug Tracker = https://github.com/EESSI/test-suite/issues [options] @@ -14,10 +16,7 @@ install_requires = setuptools python_requires = >=3.6 packages = find: -package_dir = - =eessi/testsuite +namespace_packages = eessi [options.packages.find] -where = eessi/testsuite include = eessi* -namespaces = true