Commit 84f213f

Merge branch 'main' into osu_mixin
Samuel Moors committed Jan 12, 2025
2 parents e905e28 + 695b7b2
Showing 9 changed files with 107 additions and 190 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,3 +1,7 @@
 __pycache__/
 *.egg-info/
 build/
+
+# Vim
+*.sw[op]
+*~
25 changes: 9 additions & 16 deletions CI/hortense_local_ss/ci_config.sh
@@ -1,14 +1,16 @@
 # Configurable items
-if [ -z "${TEST_SUITE_PARTITION}" ]; then
-    echo "You have to indicate on which partition the test-suite will run on vsc-Hortense"
-    echo "This environment variable needs to be set TEST_SUITE_PARTITION=cpu_rome_256gb"
-    echo "Can only be set to 'cpu_rome_256gb' until new functionality of 'sched_options' is part of"
-    echo "the ReFrame release https://github.com/reframe-hpc/reframe/issues/2970"
-    exit 1
+if [[ "$TEST_SUITE_PARTITION" == "GPU" ]]; then
+    module --force purge
+    if [ -z "${SET_LOCAL_MODULE_ENV}" ]; then
+        export SET_LOCAL_MODULE_ENV=True
+    fi
+    if [ -z "${LOCAL_MODULES}" ]; then
+        export LOCAL_MODULES="cluster/dodrio/gpu_rome_a100"
+    fi
 fi
 
 if [ -z "${REFRAME_ARGS}" ]; then
-    REFRAME_ARGS="--tag CI --tag 1_node|2_nodes --system hortense:${TEST_SUITE_PARTITION}"
+    REFRAME_ARGS="--tag CI --tag 1_node|2_nodes"
 fi
 
 if [ -z "${USE_EESSI_SOFTWARE_STACK}" ]; then

@@ -21,13 +23,4 @@ fi
 
 if [ -z "${UNSET_MODULEPATH}" ]; then
     export UNSET_MODULEPATH=False
-    module --force purge
 fi
-
-if [ -z "${SET_LOCAL_MODULE_ENV}" ]; then
-    export SET_LOCAL_MODULE_ENV=True
-fi
-
-if [ -z "${LOCAL_MODULES}" ]; then
-    export LOCAL_MODULES="cluster/dodrio/${TEST_SUITE_PARTITION}"
-fi
18 changes: 16 additions & 2 deletions config/vsc_hortense.py
@@ -4,8 +4,7 @@
 # authors: Samuel Moors (VUB-HPC), Kenneth Hoste (HPC-UGent), Lara Peeters (HPC-UGent)
 
 # Use generated topology file by ReFrame for CPU partitions
-# Cannot use autodetection until new functionality of `sched_options` is part of
-# the ReFrame release https://github.com/reframe-hpc/reframe/issues/2970
+# `sched_access_in_submit` does not work with setting `'remote_detect': True,`
 
 # Instructions on generating topology file
 # ```

@@ -64,6 +63,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=cpu_rome'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'CPU nodes (AMD Rome, 256GiB RAM)',
             'max_jobs': 20,

@@ -89,6 +91,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=cpu_rome_512'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'CPU nodes (AMD Rome, 512GiB RAM)',
             'max_jobs': 20,

@@ -114,6 +119,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=cpu_milan'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'CPU nodes (AMD Milan, 256GiB RAM)',
             'max_jobs': 20,

@@ -139,6 +147,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=gpu_rome_a100_40'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'GPU nodes (A100 40GB)',
             'max_jobs': 20,

@@ -176,6 +187,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=gpu_rome_a100_80'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'GPU nodes (A100 80GB)',
             'max_jobs': 20,
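For reference, a minimal sketch of one partition entry as it reads after this change (field values are taken from the diff above; the partition name, the omitted hortense_access prefix, and the behaviour comment are assumptions based on the ReFrame configuration schema):

    # Hypothetical excerpt of a ReFrame partition definition; only the fields
    # shown in the diff above are taken from the actual config.
    partition_cpu_rome = {
        'name': 'cpu_rome_256gb',            # assumed partition name
        'scheduler': 'slurm',
        'access': ['--partition=cpu_rome'],  # hortense_access prefix omitted here
        'sched_options': {
            # pass the 'access' options on the submit command line
            # instead of emitting them as #SBATCH directives
            'sched_access_in_submit': True,
        },
        'environs': ['default'],
        'descr': 'CPU nodes (AMD Rome, 256GiB RAM)',
        'max_jobs': 20,
    }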
1 change: 1 addition & 0 deletions eessi/testsuite/eessi_mixin.py
@@ -40,6 +40,7 @@ class EESSI_Mixin(RegressionMixin):
 
     # Set defaults for these class variables, can be overwritten by child class if desired
     measure_memory_usage = variable(bool, value=False)
+    exact_memory = variable(bool, value=False)
     scale = parameter(SCALES.keys())
     bench_name = None
    bench_name_ci = None
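A short sketch of how a child test can opt in to the new flag (the test class here is hypothetical; exact_memory and required_mem_per_node come from EESSI_Mixin and the hooks.py change below):

    import reframe as rfm
    from eessi.testsuite.eessi_mixin import EESSI_Mixin

    class ExampleTest(rfm.RunOnlyRegressionTest, EESSI_Mixin):  # hypothetical test
        # request exactly required_mem_per_node() instead of the
        # core-count-proportional default used by hooks.req_memory_per_node
        exact_memory = True

        def required_mem_per_node(self):
            return 4096  # MiB per node; illustrative value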
26 changes: 13 additions & 13 deletions eessi/testsuite/hooks.py
@@ -3,7 +3,6 @@
 """
 import math
 import shlex
-import warnings
 
 import reframe as rfm
 import reframe.core.logging as rflog
@@ -432,10 +431,10 @@ def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
     elif len(test.valid_systems) == 1:
         test.valid_systems[0] = f'{test.valid_systems[0]} {valid_systems}'
     else:
-        warn_msg = f"valid_systems has multiple ({len(test.valid_systems)}) items,"
-        warn_msg += " which is not supported by this hook."
-        warn_msg += " Make sure to handle filtering yourself."
-        warnings.warn(warn_msg)
+        msg = f"valid_systems has multiple ({len(test.valid_systems)}) items,"
+        msg += " which is not supported by this hook."
+        msg += " Make sure to handle filtering yourself."
+        rflog.getlogger().warning(msg)
     return


@@ -529,7 +528,6 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
     # and return from this hook (as setting test.extra_resources will be ignored in that case according to
     # https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.pipeline.RegressionTest.extra_resources
     if 'memory' not in test.current_partition.resources:
-        logger = rflog.getlogger()
         msg = "Your ReFrame configuration file does not specify any resource called 'memory' for this partition "
         msg += f" ({test.current_partition.name})."
         msg += " Without this, an explicit memory request cannot be made from the scheduler. This test will run,"

@@ -538,7 +536,7 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
         msg += " 'memory' in your ReFrame configuration file for this partition."
         msg += " For a SLURM system, one would e.g. define:"
         msg += " 'resources': [{'name': 'memory', 'options': ['--mem={size}']}]"
-        logger.warning(msg)
+        rflog.getlogger().warning(msg)
         # We return, as setting a test.extra_resources is pointless - it would be ignored anyway
         # This way, we also don't add any lines to the log that a specific amount of memory was requested
         return
@@ -557,8 +555,12 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
     log(f"Memory requested by application: {app_mem_req} MiB")
     log(f"Memory proportional to the core count: {proportional_mem} MiB")
 
-    # Request the maximum of the proportional_mem, and app_mem_req to the scheduler
-    req_mem_per_node = max(proportional_mem, app_mem_req)
+    if test.exact_memory:
+        # Request the exact amount of required memory
+        req_mem_per_node = app_mem_req
+    else:
+        # Request the maximum of proportional_mem and app_mem_req from the scheduler
+        req_mem_per_node = max(proportional_mem, app_mem_req)
 
     test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M'}}
     log(f"Requested {req_mem_per_node} MiB per node from the SLURM batch scheduler")
@@ -580,14 +582,13 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
         log(f"Requested {req_mem_per_task} MiB per task from the torque batch scheduler")
 
     else:
-        logger = rflog.getlogger()
         msg = "hooks.req_memory_per_node does not support the scheduler you configured"
         msg += f" ({test.current_partition.scheduler.registered_name})."
         msg += " The test will run, but since it doesn't request the required amount of memory explicitly,"
         msg += " it may result in an out-of-memory error."
         msg += " Please expand the functionality of hooks.req_memory_per_node for your scheduler."
         # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command
-        logger.warning(msg)
+        rflog.getlogger().warning(msg)
 
 
 def set_modules(test: rfm.RegressionTest):
@@ -671,14 +672,13 @@ def set_compact_process_binding(test: rfm.RegressionTest):
         log(f'Set environment variable SLURM_DISTRIBUTION to {test.env_vars["SLURM_DISTRIBUTION"]}')
         log(f'Set environment variable SLURM_CPU_BIND to {test.env_vars["SLURM_CPU_BIND"]}')
     else:
-        logger = rflog.getlogger()
         msg = "hooks.set_compact_process_binding does not support the current launcher"
         msg += f" ({test.current_partition.launcher_type().registered_name})."
         msg += " The test will run, but using the default binding strategy of your parallel launcher."
         msg += " This may lead to suboptimal performance."
         msg += " Please expand the functionality of hooks.set_compact_process_binding for your parallel launcher."
         # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command
-        logger.warning(msg)
+        rflog.getlogger().warning(msg)
 
 
 def set_compact_thread_binding(test: rfm.RegressionTest):
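A standalone sketch of the new memory-request decision in req_memory_per_node, with made-up numbers (the helper below is illustrative, not part of hooks.py):

    def requested_mem_per_node(app_mem_req, proportional_mem, exact_memory=False):
        # mirrors the hook: exact request, or the max of the core-count-proportional
        # memory and the application-required memory
        return app_mem_req if exact_memory else max(proportional_mem, app_mem_req)

    assert requested_mem_per_node(2048, 8000) == 8000                     # default behaviour
    assert requested_mem_per_node(2048, 8000, exact_memory=True) == 2048  # exact_memory=True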
77 changes: 20 additions & 57 deletions eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
@@ -2,79 +2,40 @@
 
 import reframe as rfm
 import reframe.utility.sanity as sn
-# Added only to make the linter happy
-from reframe.core.builtins import parameter, variable, run_after, sanity_function, performance_function
+from reframe.core.builtins import parameter, run_after, sanity_function, performance_function
 
 from eessi.testsuite import hooks
-from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU
+from eessi.testsuite.constants import DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU
 from eessi.testsuite.eessi_mixin import EESSI_Mixin
 from eessi.testsuite.utils import find_modules
 
 
-class EESSI_PyTorch_torchvision(rfm.RunOnlyRegressionTest):
+class EESSI_PyTorch_torchvision(rfm.RunOnlyRegressionTest, EESSI_Mixin):
+    descr = 'Benchmark that runs a selected torchvision model on synthetic data'
+
     nn_model = parameter(['vgg16', 'resnet50', 'resnet152', 'densenet121', 'mobilenet_v3_large'])
-    scale = parameter(SCALES.keys())
+    bench_name_ci = 'resnet50'
     parallel_strategy = parameter([None, 'ddp'])
-    compute_device = variable(str)
     # Both torchvision and PyTorch-bundle modules have everything needed to run this test
     module_name = parameter(chain(find_modules('torchvision'), find_modules('PyTorch-bundle')))
 
-    descr = 'Benchmark that runs a selected torchvision model on synthetic data'
-
     executable = 'python'
-
-    valid_prog_environs = ['default']
-    valid_systems = ['*']
-
     time_limit = '30m'
 
+    def required_mem_per_node(self):
+        return self.num_tasks_per_node * 1024
+
     @run_after('init')
     def prepare_test(self):
 
         # Set nn_model as executable option
         self.executable_opts = ['pytorch_synthetic_benchmark.py --model %s' % self.nn_model]
+        self.bench_name = self.nn_model
 
         # If not a GPU run, disable CUDA
-        if self.compute_device != DEVICE_TYPES[GPU]:
+        if self.device_type != DEVICE_TYPES[GPU]:
             self.executable_opts += ['--no-cuda']
 
-    @run_after('init')
-    def apply_init_hooks(self):
-        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
-        hooks.filter_supported_scales(self)
-
-        # Make sure that GPU tests run in partitions that support running on a GPU,
-        # and that CPU-only tests run in partitions that support running CPU-only.
-        # Also support setting valid_systems on the cmd line.
-        hooks.filter_valid_systems_by_device_type(self, required_device_type=self.compute_device)
-
-        # Support selecting modules on the cmd line.
-        hooks.set_modules(self)
-
-        # Support selecting scales on the cmd line via tags.
-        hooks.set_tag_scale(self)
-
-    @run_after('init')
-    def set_tag_ci(self):
-        if self.nn_model == 'resnet50':
-            self.tags.add(TAGS['CI'])
-
-    @run_after('setup')
-    def apply_setup_hooks(self):
-        if self.compute_device == DEVICE_TYPES[GPU]:
-            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU])
-        else:
-            # Hybrid code, for which launching one task per NUMA_NODE is typically the most efficient
-            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])
-
-        # This is a hybrid test, binding is important for performance
-        hooks.set_compact_process_binding(self)
-
-        # Set OMP_NUM_THREADS based on the number of cores per task
-        self.env_vars["OMP_NUM_THREADS"] = self.num_cpus_per_task
-
     @run_after('setup')
     def set_ddp_options(self):
-        # Set environment variables for PyTorch DDP
+        "Set environment variables for PyTorch DDP"
         if self.parallel_strategy == 'ddp':
             # Set additional options required by DDP
             self.executable_opts += ["--master-port $(python get_free_socket.py)"]
@@ -94,7 +55,7 @@ def filter_invalid_parameter_combinations(self):
 
     @run_after('setup')
     def pass_parallel_strategy(self):
-        # Set parallelization strategy when using more than one process
+        "Set parallelization strategy when using more than one process"
         if self.num_tasks != 1:
             self.executable_opts += ['--use-%s' % self.parallel_strategy]

@@ -110,21 +71,23 @@ def total_throughput(self):
 
     @performance_function('img/sec')
     def througput_per_CPU(self):
-        '''Training througput per CPU'''
-        if self.compute_device == DEVICE_TYPES[CPU]:
+        '''Training througput per device type'''
+        if self.device_type == DEVICE_TYPES[CPU]:
             return sn.extractsingle(r'Img/sec per CPU:\s+(?P<perf_per_cpu>\S+)', self.stdout, 'perf_per_cpu', float)
         else:
             return sn.extractsingle(r'Img/sec per GPU:\s+(?P<perf_per_gpu>\S+)', self.stdout, 'perf_per_gpu', float)
 
 
 @rfm.simple_test
 class EESSI_PyTorch_torchvision_CPU(EESSI_PyTorch_torchvision):
-    compute_device = DEVICE_TYPES[CPU]
+    device_type = DEVICE_TYPES[CPU]
+    compute_unit = COMPUTE_UNIT[NUMA_NODE]
 
 
 @rfm.simple_test
 class EESSI_PyTorch_torchvision_GPU(EESSI_PyTorch_torchvision):
-    compute_device = DEVICE_TYPES[GPU]
+    device_type = DEVICE_TYPES[GPU]
+    compute_unit = COMPUTE_UNIT[GPU]
     precision = parameter(['default', 'mixed'])
 
     @run_after('init')
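The net effect of this refactor is that per-test boilerplate (scale filtering, device filtering, task assignment, binding, CI tagging) moves into EESSI_Mixin; a condensed sketch of the resulting pattern (identifiers are the ones used in this diff, the test class itself is hypothetical):

    import reframe as rfm
    from eessi.testsuite.constants import DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE
    from eessi.testsuite.eessi_mixin import EESSI_Mixin

    class ExampleMixinTest(rfm.RunOnlyRegressionTest, EESSI_Mixin):  # hypothetical
        # the mixin consumes these and applies the scale/device filtering,
        # task-assignment and process-binding hooks itself
        device_type = DEVICE_TYPES[CPU]
        compute_unit = COMPUTE_UNIT[NUMA_NODE]
        bench_name_ci = 'resnet50'  # variant that gets the CI tag

        def required_mem_per_node(self):
            return self.num_tasks_per_node * 1024  # MiB, as in the test above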