Commit 84f213f

Merge branch 'main' into osu_mixin
Samuel Moors committed Jan 12, 2025
2 parents e905e28 + 695b7b2
Showing 9 changed files with 107 additions and 190 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,3 +1,7 @@
 __pycache__/
 *.egg-info/
 build/
+
+# Vim
+*.sw[op]
+*~
25 changes: 9 additions & 16 deletions CI/hortense_local_ss/ci_config.sh
@@ -1,14 +1,16 @@
 # Configurable items
-if [ -z "${TEST_SUITE_PARTITION}" ]; then
-    echo "You have to indicate on which partition the test-suite will run on vsc-Hortense"
-    echo "This environment variable needs to be set TEST_SUITE_PARTITION=cpu_rome_256gb"
-    echo "Can only be set to 'cpu_rome_256gb' until new functionality of 'sched_options' is part of"
-    echo "the ReFrame release https://github.com/reframe-hpc/reframe/issues/2970"
-    exit 1
+if [[ "$TEST_SUITE_PARTITION" == "GPU" ]]; then
+    module --force purge
+    if [ -z "${SET_LOCAL_MODULE_ENV}" ]; then
+        export SET_LOCAL_MODULE_ENV=True
+    fi
+    if [ -z "${LOCAL_MODULES}" ]; then
+        export LOCAL_MODULES="cluster/dodrio/gpu_rome_a100"
+    fi
 fi
 
 if [ -z "${REFRAME_ARGS}" ]; then
-    REFRAME_ARGS="--tag CI --tag 1_node|2_nodes --system hortense:${TEST_SUITE_PARTITION}"
+    REFRAME_ARGS="--tag CI --tag 1_node|2_nodes"
 fi
 
 if [ -z "${USE_EESSI_SOFTWARE_STACK}" ]; then

@@ -21,13 +23,4 @@ fi
 
 if [ -z "${UNSET_MODULEPATH}" ]; then
     export UNSET_MODULEPATH=False
-    module --force purge
 fi
-
-if [ -z "${SET_LOCAL_MODULE_ENV}" ]; then
-    export SET_LOCAL_MODULE_ENV=True
-fi
-
-if [ -z "${LOCAL_MODULES}" ]; then
-    export LOCAL_MODULES="cluster/dodrio/${TEST_SUITE_PARTITION}"
-fi
18 changes: 16 additions & 2 deletions config/vsc_hortense.py
@@ -4,8 +4,7 @@
 # authors: Samuel Moors (VUB-HPC), Kenneth Hoste (HPC-UGent), Lara Peeters (HPC-UGent)
 
 # Use generated topology file by ReFrame for CPU partitions
-# Cannot use autodetection until new functionality of `sched_options` is part of
-# the ReFrame release https://github.com/reframe-hpc/reframe/issues/2970
+# `sched_access_in_submit` does not work with setting `'remote_detect': True,`
 
 # Instructions on generating topology file
 # ```

@@ -64,6 +63,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=cpu_rome'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'CPU nodes (AMD Rome, 256GiB RAM)',
             'max_jobs': 20,

@@ -89,6 +91,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=cpu_rome_512'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'CPU nodes (AMD Rome, 512GiB RAM)',
             'max_jobs': 20,

@@ -114,6 +119,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=cpu_milan'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'CPU nodes (AMD Milan, 256GiB RAM)',
             'max_jobs': 20,

@@ -139,6 +147,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=gpu_rome_a100_40'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'GPU nodes (A100 40GB)',
             'max_jobs': 20,

@@ -176,6 +187,9 @@ def command(self, job):
             'scheduler': 'slurm',
             'prepare_cmds': [prepare_eessi_init, common_eessi_init()],
             'access': hortense_access + ['--partition=gpu_rome_a100_80'],
+            'sched_options': {
+                'sched_access_in_submit': True,
+            },
             'environs': ['default'],
             'descr': 'GPU nodes (A100 80GB)',
             'max_jobs': 20,
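For reference, a minimal sketch of one partition entry as it reads after this change (field values are taken from the diff above; the partition name, the omitted hortense_access prefix, and the behaviour comment are assumptions based on the ReFrame configuration schema):

    # Hypothetical excerpt of a ReFrame partition definition; only the fields
    # shown in the diff above are taken from the actual config.
    partition_cpu_rome = {
        'name': 'cpu_rome_256gb',            # assumed partition name
        'scheduler': 'slurm',
        'access': ['--partition=cpu_rome'],  # hortense_access prefix omitted here
        'sched_options': {
            # pass the 'access' options on the submit command line
            # instead of emitting them as #SBATCH directives
            'sched_access_in_submit': True,
        },
        'environs': ['default'],
        'descr': 'CPU nodes (AMD Rome, 256GiB RAM)',
        'max_jobs': 20,
    }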
1 change: 1 addition & 0 deletions eessi/testsuite/eessi_mixin.py
@@ -40,6 +40,7 @@ class EESSI_Mixin(RegressionMixin):
 
     # Set defaults for these class variables, can be overwritten by child class if desired
     measure_memory_usage = variable(bool, value=False)
+    exact_memory = variable(bool, value=False)
     scale = parameter(SCALES.keys())
     bench_name = None
    bench_name_ci = None
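A short sketch of how a child test can opt in to the new flag (the test class here is hypothetical; exact_memory and required_mem_per_node come from EESSI_Mixin and the hooks.py change below):

    import reframe as rfm
    from eessi.testsuite.eessi_mixin import EESSI_Mixin

    class ExampleTest(rfm.RunOnlyRegressionTest, EESSI_Mixin):  # hypothetical test
        # request exactly required_mem_per_node() instead of the
        # core-count-proportional default used by hooks.req_memory_per_node
        exact_memory = True

        def required_mem_per_node(self):
            return 4096  # MiB per node; illustrative value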
26 changes: 13 additions & 13 deletions eessi/testsuite/hooks.py
@@ -3,7 +3,6 @@
 """
 import math
 import shlex
-import warnings
 
 import reframe as rfm
 import reframe.core.logging as rflog
@@ -432,10 +431,10 @@ def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
     elif len(test.valid_systems) == 1:
         test.valid_systems[0] = f'{test.valid_systems[0]} {valid_systems}'
     else:
-        warn_msg = f"valid_systems has multiple ({len(test.valid_systems)}) items,"
-        warn_msg += " which is not supported by this hook."
-        warn_msg += " Make sure to handle filtering yourself."
-        warnings.warn(warn_msg)
+        msg = f"valid_systems has multiple ({len(test.valid_systems)}) items,"
+        msg += " which is not supported by this hook."
+        msg += " Make sure to handle filtering yourself."
+        rflog.getlogger().warning(msg)
     return


@@ -529,7 +528,6 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
     # and return from this hook (as setting test.extra_resources will be ignored in that case according to
     # https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.pipeline.RegressionTest.extra_resources
     if 'memory' not in test.current_partition.resources:
-        logger = rflog.getlogger()
         msg = "Your ReFrame configuration file does not specify any resource called 'memory' for this partition "
         msg += f" ({test.current_partition.name})."
         msg += " Without this, an explicit memory request cannot be made from the scheduler. This test will run,"

@@ -538,7 +536,7 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
         msg += " 'memory' in your ReFrame configuration file for this partition."
         msg += " For a SLURM system, one would e.g. define:"
         msg += " 'resources': [{'name': 'memory', 'options': ['--mem={size}']}]"
-        logger.warning(msg)
+        rflog.getlogger().warning(msg)
         # We return, as setting a test.extra_resources is pointless - it would be ignored anyway
         # This way, we also don't add any lines to the log that a specific amount of memory was requested
         return
@@ -557,8 +555,12 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
     log(f"Memory requested by application: {app_mem_req} MiB")
     log(f"Memory proportional to the core count: {proportional_mem} MiB")
 
-    # Request the maximum of the proportional_mem, and app_mem_req to the scheduler
-    req_mem_per_node = max(proportional_mem, app_mem_req)
+    if test.exact_memory:
+        # Request the exact amount of required memory
+        req_mem_per_node = app_mem_req
+    else:
+        # Request the maximum of proportional_mem and app_mem_req from the scheduler
+        req_mem_per_node = max(proportional_mem, app_mem_req)
 
     test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M'}}
     log(f"Requested {req_mem_per_node} MiB per node from the SLURM batch scheduler")
@@ -580,14 +582,13 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
         log(f"Requested {req_mem_per_task} MiB per task from the torque batch scheduler")
 
     else:
-        logger = rflog.getlogger()
         msg = "hooks.req_memory_per_node does not support the scheduler you configured"
         msg += f" ({test.current_partition.scheduler.registered_name})."
         msg += " The test will run, but since it doesn't request the required amount of memory explicitly,"
         msg += " it may result in an out-of-memory error."
         msg += " Please expand the functionality of hooks.req_memory_per_node for your scheduler."
         # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command
-        logger.warning(msg)
+        rflog.getlogger().warning(msg)
 
 
 def set_modules(test: rfm.RegressionTest):
@@ -671,14 +672,13 @@ def set_compact_process_binding(test: rfm.RegressionTest):
         log(f'Set environment variable SLURM_DISTRIBUTION to {test.env_vars["SLURM_DISTRIBUTION"]}')
         log(f'Set environment variable SLURM_CPU_BIND to {test.env_vars["SLURM_CPU_BIND"]}')
     else:
-        logger = rflog.getlogger()
         msg = "hooks.set_compact_process_binding does not support the current launcher"
         msg += f" ({test.current_partition.launcher_type().registered_name})."
         msg += " The test will run, but using the default binding strategy of your parallel launcher."
         msg += " This may lead to suboptimal performance."
         msg += " Please expand the functionality of hooks.set_compact_process_binding for your parallel launcher."
         # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command
-        logger.warning(msg)
+        rflog.getlogger().warning(msg)
 
 
 def set_compact_thread_binding(test: rfm.RegressionTest):
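A standalone sketch of the new memory-request decision in req_memory_per_node, with made-up numbers (the helper below is illustrative, not part of hooks.py):

    def requested_mem_per_node(app_mem_req, proportional_mem, exact_memory=False):
        # mirrors the hook: exact request, or the max of the core-count-proportional
        # memory and the application-required memory
        return app_mem_req if exact_memory else max(proportional_mem, app_mem_req)

    assert requested_mem_per_node(2048, 8000) == 8000                     # default behaviour
    assert requested_mem_per_node(2048, 8000, exact_memory=True) == 2048  # exact_memory=True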
77 changes: 20 additions & 57 deletions eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
@@ -2,79 +2,40 @@
 
 import reframe as rfm
 import reframe.utility.sanity as sn
-# Added only to make the linter happy
-from reframe.core.builtins import parameter, variable, run_after, sanity_function, performance_function
+from reframe.core.builtins import parameter, run_after, sanity_function, performance_function
 
 from eessi.testsuite import hooks
-from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU
+from eessi.testsuite.constants import DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU
 from eessi.testsuite.eessi_mixin import EESSI_Mixin
 from eessi.testsuite.utils import find_modules
 
 
-class EESSI_PyTorch_torchvision(rfm.RunOnlyRegressionTest):
+class EESSI_PyTorch_torchvision(rfm.RunOnlyRegressionTest, EESSI_Mixin):
+    descr = 'Benchmark that runs a selected torchvision model on synthetic data'
+
     nn_model = parameter(['vgg16', 'resnet50', 'resnet152', 'densenet121', 'mobilenet_v3_large'])
-    scale = parameter(SCALES.keys())
+    bench_name_ci = 'resnet50'
     parallel_strategy = parameter([None, 'ddp'])
-    compute_device = variable(str)
     # Both torchvision and PyTorch-bundle modules have everything needed to run this test
     module_name = parameter(chain(find_modules('torchvision'), find_modules('PyTorch-bundle')))
 
-    descr = 'Benchmark that runs a selected torchvision model on synthetic data'
-
     executable = 'python'
-
-    valid_prog_environs = ['default']
-    valid_systems = ['*']
-
     time_limit = '30m'
 
+    def required_mem_per_node(self):
+        return self.num_tasks_per_node * 1024
+
     @run_after('init')
     def prepare_test(self):
 
         # Set nn_model as executable option
         self.executable_opts = ['pytorch_synthetic_benchmark.py --model %s' % self.nn_model]
+        self.bench_name = self.nn_model
 
         # If not a GPU run, disable CUDA
-        if self.compute_device != DEVICE_TYPES[GPU]:
+        if self.device_type != DEVICE_TYPES[GPU]:
             self.executable_opts += ['--no-cuda']
 
-    @run_after('init')
-    def apply_init_hooks(self):
-        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
-        hooks.filter_supported_scales(self)
-
-        # Make sure that GPU tests run in partitions that support running on a GPU,
-        # and that CPU-only tests run in partitions that support running CPU-only.
-        # Also support setting valid_systems on the cmd line.
-        hooks.filter_valid_systems_by_device_type(self, required_device_type=self.compute_device)
-
-        # Support selecting modules on the cmd line.
-        hooks.set_modules(self)
-
-        # Support selecting scales on the cmd line via tags.
-        hooks.set_tag_scale(self)
-
-    @run_after('init')
-    def set_tag_ci(self):
-        if self.nn_model == 'resnet50':
-            self.tags.add(TAGS['CI'])
-
-    @run_after('setup')
-    def apply_setup_hooks(self):
-        if self.compute_device == DEVICE_TYPES[GPU]:
-            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU])
-        else:
-            # Hybrid code, for which launching one task per NUMA_NODE is typically the most efficient
-            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])
-
-        # This is a hybrid test, binding is important for performance
-        hooks.set_compact_process_binding(self)
-
-        # Set OMP_NUM_THREADS based on the number of cores per task
-        self.env_vars["OMP_NUM_THREADS"] = self.num_cpus_per_task
-
     @run_after('setup')
     def set_ddp_options(self):
-        # Set environment variables for PyTorch DDP
+        "Set environment variables for PyTorch DDP"
         if self.parallel_strategy == 'ddp':
             # Set additional options required by DDP
             self.executable_opts += ["--master-port $(python get_free_socket.py)"]
@@ -94,7 +55,7 @@ def filter_invalid_parameter_combinations(self):
 
     @run_after('setup')
     def pass_parallel_strategy(self):
-        # Set parallelization strategy when using more than one process
+        "Set parallelization strategy when using more than one process"
         if self.num_tasks != 1:
             self.executable_opts += ['--use-%s' % self.parallel_strategy]

@@ -110,21 +71,23 @@ def total_throughput(self):
 
     @performance_function('img/sec')
     def througput_per_CPU(self):
-        '''Training througput per CPU'''
-        if self.compute_device == DEVICE_TYPES[CPU]:
+        '''Training througput per device type'''
+        if self.device_type == DEVICE_TYPES[CPU]:
             return sn.extractsingle(r'Img/sec per CPU:\s+(?P<perf_per_cpu>\S+)', self.stdout, 'perf_per_cpu', float)
         else:
             return sn.extractsingle(r'Img/sec per GPU:\s+(?P<perf_per_gpu>\S+)', self.stdout, 'perf_per_gpu', float)
 
 
 @rfm.simple_test
 class EESSI_PyTorch_torchvision_CPU(EESSI_PyTorch_torchvision):
-    compute_device = DEVICE_TYPES[CPU]
+    device_type = DEVICE_TYPES[CPU]
+    compute_unit = COMPUTE_UNIT[NUMA_NODE]
 
 
 @rfm.simple_test
 class EESSI_PyTorch_torchvision_GPU(EESSI_PyTorch_torchvision):
-    compute_device = DEVICE_TYPES[GPU]
+    device_type = DEVICE_TYPES[GPU]
+    compute_unit = COMPUTE_UNIT[GPU]
     precision = parameter(['default', 'mixed'])
 
     @run_after('init')
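The net effect of this refactor is that per-test boilerplate (scale filtering, device filtering, task assignment, binding, CI tagging) moves into EESSI_Mixin; a condensed sketch of the resulting pattern (identifiers are the ones used in this diff, the test class itself is hypothetical):

    import reframe as rfm
    from eessi.testsuite.constants import DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE
    from eessi.testsuite.eessi_mixin import EESSI_Mixin

    class ExampleMixinTest(rfm.RunOnlyRegressionTest, EESSI_Mixin):  # hypothetical
        # the mixin consumes these and applies the scale/device filtering,
        # task-assignment and process-binding hooks itself
        device_type = DEVICE_TYPES[CPU]
        compute_unit = COMPUTE_UNIT[NUMA_NODE]
        bench_name_ci = 'resnet50'  # variant that gets the CI tag

        def required_mem_per_node(self):
            return self.num_tasks_per_node * 1024  # MiB, as in the test above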