Skip to content

Commit

Permalink
Merge pull request #111 from casparvl/filter_incompatible_scales
Browse files Browse the repository at this point in the history
Add new hook to filter invalid scales based on features set in the config file
  • Loading branch information
smoors authored Feb 13, 2024
2 parents 3a09ab3 + f879bb1 commit d516f05
Show file tree
Hide file tree
Showing 12 changed files with 93 additions and 26 deletions.
4 changes: 2 additions & 2 deletions config/aws_mc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import os

from eessi.testsuite.common_config import common_logging_config, common_eessi_init
from eessi.testsuite.constants import FEATURES
from eessi.testsuite.constants import FEATURES, SCALES

# This config will write all staging, output and logging to subdirs under this prefix
# Override with RFM_PREFIX environment variable
Expand Down Expand Up @@ -97,7 +97,7 @@
'environs': ['default'],
'features': [
FEATURES['CPU']
],
] + list(SCALES.keys()),
'prepare_cmds': [
'source %s' % common_eessi_init(),
# Required when using srun as launcher with --export=NONE in partition access, in order to ensure job
Expand Down
2 changes: 1 addition & 1 deletion config/github_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
'scheduler': 'local',
'launcher': 'local',
'environs': ['default'],
'features': [FEATURES[CPU]],
'features': [FEATURES[CPU]] + list(SCALES.keys()),
'processor': {'num_cpus': 2},
'resources': [
{
Expand Down
4 changes: 2 additions & 2 deletions config/it4i_karolina.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
'max_jobs': 120,
'features': [
FEATURES[CPU],
],
] + list(SCALES.keys()),
'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/'
},
# We don't have GPU budget on Karolina at this time
Expand Down Expand Up @@ -88,7 +88,7 @@
# ],
# 'features': [
# FEATURES[GPU],
# ],
# ] + list(SCALES.keys()),
# 'descr': 'GPU partition with accelerated nodes, see https://docs.it4i.cz/karolina/hardware-overview/'
# },
]
Expand Down
4 changes: 2 additions & 2 deletions config/izum_vega.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
],
'features': [
FEATURES[CPU],
],
] + list(SCALES.keys()),
'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/'
},
{
Expand Down Expand Up @@ -97,7 +97,7 @@
],
'features': [
FEATURES[GPU],
],
] + list(SCALES.keys()),
'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
},
]
Expand Down
6 changes: 4 additions & 2 deletions config/settings_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@
'options': ['--mem={size}'],
}
],
'features': [FEATURES[CPU]],
# list(SCALES.keys()) adds all the scales from eessi.testsuite.constants as valid for this partition
# Can be modified if not all scales can run on this partition, see e.g. the surf_snellius.py config
'features': [FEATURES[CPU]] + list(SCALES.keys()),
},
{
'name': 'gpu_partition',
Expand Down Expand Up @@ -94,7 +96,7 @@
'features': [
FEATURES[CPU],
FEATURES[GPU],
],
] + list(SCALES.keys()),
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
},
Expand Down
9 changes: 6 additions & 3 deletions config/surf_snellius.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
# Override with RFM_PREFIX environment variable
reframe_prefix = os.path.join(os.environ['HOME'], 'reframe_runs')

# Jobs that partially fill multiple nodes are not allowed on the GPU partition
valid_scales_snellius_gpu = [s for s in SCALES if s not in ['1_cpn_2_nodes', '1_cpn_4_nodes']]

# This is an example configuration file
site_configuration = {
'systems': [
Expand Down Expand Up @@ -49,7 +52,7 @@
],
'features': [
FEATURES[CPU],
],
] + list(SCALES.keys()),
'descr': 'AMD Rome CPU partition with native EESSI stack'
},
{
Expand All @@ -68,7 +71,7 @@
],
'features': [
FEATURES[CPU],
],
] + list(SCALES.keys()),
'descr': 'AMD Genoa CPU partition with native EESSI stack'
},

Expand Down Expand Up @@ -98,7 +101,7 @@
],
'features': [
FEATURES[GPU],
],
] + valid_scales_snellius_gpu,
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
},
Expand Down
10 changes: 5 additions & 5 deletions config/vsc_hortense.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def command(self, job):
],
'features': [
FEATURES[CPU],
],
] + list(SCALES.keys()),
},
{
'name': 'cpu_rome_512gb',
Expand All @@ -80,7 +80,7 @@ def command(self, job):
],
'features': [
FEATURES[CPU],
],
] + list(SCALES.keys()),
},
{
'name': 'cpu_milan',
Expand All @@ -107,7 +107,7 @@ def command(self, job):
],
'features': [
FEATURES[CPU],
],
] + list(SCALES.keys()),
},
{
'name': 'gpu_rome_a100_40gb',
Expand All @@ -128,7 +128,7 @@ def command(self, job):
},
'features': [
FEATURES[GPU],
],
] + list(SCALES.keys()),
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
},
Expand Down Expand Up @@ -169,7 +169,7 @@ def command(self, job):
},
'features': [
FEATURES[GPU],
],
] + list(SCALES.keys()),
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
},
Expand Down
3 changes: 3 additions & 0 deletions eessi/testsuite/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,6 @@
'8_nodes': {'num_nodes': 8, 'node_part': 1},
'16_nodes': {'num_nodes': 16, 'node_part': 1},
}

# When tests are filtered by the hooks, the valid_systems is set to this system name:
INVALID_SYSTEM = "INVALID_SYSTEM"
62 changes: 56 additions & 6 deletions eessi/testsuite/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import math
import shlex
import warnings

import reframe as rfm

Expand Down Expand Up @@ -283,15 +284,64 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
log(f'num_tasks set to {test.num_tasks}')


def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
    """
    Update test.valid_systems with the constraint given in valid_systems.

    The outcome depends on the argument and on the current state of test.valid_systems:
    - valid_systems is the empty string: test.valid_systems becomes [INVALID_SYSTEM]
      (see eessi.testsuite.constants), regardless of its previous value.
    - test.valid_systems is an empty list: it is left untouched (the test should not be run).
    - test.valid_systems is the default ['*']: it is replaced by [valid_systems].
    - test.valid_systems holds a single, non-default entry: valid_systems is appended to that entry,
      separated by a space, so that several hooks can each add their own request for partition features.
    - test.valid_systems holds multiple entries: a warning is issued and it is left untouched. This is
      typically the case when valid_systems was overridden on the command line; the user then has to
      take care of the filtering.
    """
    # An empty request marks an invalid test: it must always be filtered out
    if valid_systems == '':
        test.valid_systems = [INVALID_SYSTEM]
        return

    num_entries = len(test.valid_systems)

    # Empty list: the test is already filtered out everywhere, and this hook should not change that
    if num_entries == 0:
        return

    # Multiple entries cannot be combined with the new request, so only warn
    if num_entries > 1:
        warnings.warn(
            f"valid_systems has multiple ({num_entries}) items,"
            " which is not supported by this hook."
            " Make sure to handle filtering yourself."
        )
        return

    current = test.valid_systems[0]
    if current == '*':
        # Still at the default value, so overwrite it
        test.valid_systems = [valid_systems]
    else:
        # Already set by someone else: extend the existing entry in place
        test.valid_systems[0] = f'{current} {valid_systems}'


def filter_supported_scales(test: rfm.RegressionTest):
    """
    Restrict the test to partitions that support its scale.

    The scale of the test (test.scale) is requested as a feature, prefixed with '+', through
    test.valid_systems. A partition that does not list this scale among its features in the
    ReFrame configuration file is thereby effectively filtered out.
    """
    # Request the current scale as a partition feature
    scale_feature = '+{}'.format(test.scale)
    _set_or_append_valid_systems(test, scale_feature)

    log(f'valid_systems set to {test.valid_systems}')

def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_device_type: str):
"""
Filter valid_systems by required device type and by whether the module supports CUDA,
unless valid_systems is specified with --setvar valid_systems=<comma-separated-list>.
"""
if test.valid_systems:
# valid_systems is specified, so don't filter
return
Any invalid combination (e.g. a non-CUDA module with a required_device_type GPU) will
cause the valid_systems to be set to an empty string, and consequently the
test.valid_systems to an invalid system name (eessi.testsuite.constants.INVALID_SYSTEM).
"""
is_cuda_module = is_cuda_required_module(test.module_name)

if is_cuda_module and required_device_type == DEVICE_TYPES[GPU]:
Expand All @@ -308,8 +358,8 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic
# Invalid combination: a module without GPU support cannot use a GPU
valid_systems = ''

if valid_systems:
test.valid_systems = [valid_systems]
# Change test.valid_systems accordingly:
_set_or_append_valid_systems(test, valid_systems)

log(f'valid_systems set to {test.valid_systems}')

Expand Down
5 changes: 4 additions & 1 deletion eessi/testsuite/tests/apps/gromacs.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,17 @@
class EESSI_GROMACS(gromacs_check):
scale = parameter(SCALES.keys())
valid_prog_environs = ['default']
valid_systems = []
valid_systems = ['*']
time_limit = '30m'
module_name = parameter(find_modules('GROMACS'))

@run_after('init')
def run_after_init(self):
"""Hooks to run after the init phase"""

# Filter on which scales are supported by the partitions defined in the ReFrame configuration
hooks.filter_supported_scales(self)

# Make sure that GPU tests run in partitions that support running on a GPU,
# and that CPU-only tests run in partitions that support running CPU-only.
# Also support setting valid_systems on the cmd line.
Expand Down
5 changes: 4 additions & 1 deletion eessi/testsuite/tests/apps/osu.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark):
''' Run-only OSU test '''
scale = parameter(filter_scales_pt2pt())
valid_prog_environs = ['default']
valid_systems = []
valid_systems = ['*']
time_limit = '30m'
module_name = parameter(find_modules('OSU-Micro-Benchmarks'))
# Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both node types. To do this the default
Expand All @@ -57,6 +57,9 @@ class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark):
@run_after('init')
def run_after_init(self):
"""hooks to run after init phase"""
# Filter on which scales are supported by the partitions defined in the ReFrame configuration
hooks.filter_supported_scales(self)

hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
is_cuda_module = utils.is_cuda_required_module(self.module_name)
# This part of the hook is meant to be for the OSU cpu tests. This is required since the non CUDA module should
Expand Down
5 changes: 4 additions & 1 deletion eessi/testsuite/tests/apps/tensorflow/tensorflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class EESSI_TensorFlow(rfm.RunOnlyRegressionTest):
# This test can run at any scale, so parameterize over all known SCALES
scale = parameter(SCALES.keys())
valid_prog_environs = ['default']
valid_systems = []
valid_systems = ['*']

# Parameterize over all modules that start with TensorFlow
module_name = parameter(utils.find_modules('TensorFlow'))
Expand Down Expand Up @@ -70,6 +70,9 @@ def perf(self):
@run_after('init')
def run_after_init(self):
"""hooks to run after the init phase"""
# Filter on which scales are supported by the partitions defined in the ReFrame configuration
hooks.filter_supported_scales(self)

hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
hooks.set_modules(self)
hooks.set_tag_scale(self)
Expand Down

0 comments on commit d516f05

Please sign in to comment.