From b735a8997cac981df59a0d05cbb429b46968ae0f Mon Sep 17 00:00:00 2001 From: Michele Pugno Date: Tue, 19 Mar 2024 15:02:48 +0100 Subject: [PATCH] Bump version to 4.6.1, update config file, simplified logging, keep-stage removed from config and added to command line, added breniac to tests. Updated GPU test, numpy and Gaussian test. Added vsc tag where missing. --- README.md | 2 +- config_vsc.py | 129 ++++++++++++++--------------- run.sh | 4 +- tests/apps/gaussian/gaussian.py | 6 +- tests/apps/julia/julia-linalg.py | 7 +- tests/apps/matlab/matlab-linalg.py | 10 ++- tests/apps/namd/namd_test.py | 8 +- tests/apps/python/numpy_check.py | 43 ++++++++-- tests/gpu/gpu_burn.py | 40 ++++----- 9 files changed, 142 insertions(+), 107 deletions(-) diff --git a/README.md b/README.md index a0d4f9f..09c168b 100644 --- a/README.md +++ b/README.md @@ -18,5 +18,5 @@ Log files and output will be saved in $HOME/reframe ## Requirements -- Reframe 3.10.1 installed as a module +- Reframe 4.6.1 installed as a module - Python3 diff --git a/config_vsc.py b/config_vsc.py index 5941212..f15239c 100644 --- a/config_vsc.py +++ b/config_vsc.py @@ -2,44 +2,17 @@ import os from py import builtin - -antwerpen_mode_options = [ - '--exec-policy=async', - '--output=/apps/antwerpen/reframe/logs/output/', - '--perflogdir=/apps/antwerpen/reframe/logs/', - '--stage=/apps/antwerpen/reframe/logs/stage/', - '--report-file=/apps/antwerpen/reframe/logs/reports/last-$VSC_INSTITUTE_CLUSTER.json', - '--compress-report', - '--nocolor'] - -perf_logging_format = [ - '{"username": "%(osuser)s"', - '"version": "%(version)s"', - '"name": "%(check_name)s"', - '"system": "%(check_system)s"', - '"partition": "%(check_partition)s"', - '"environ": "%(check_environ)s"', - '"nodelist": "%(check_job_nodelist)s"', - '"num_tasks": "%(check_num_tasks)s"', - '"num_cpus_per_task": "%(check_num_cpus_per_task)s"', - '"num_tasks_per_node": "%(check_num_tasks_per_node)s"', - '"modules": "%(check_modules)s"', - '"jobid":
"%(check_jobid)s"', - '"perf_var": "%(check_perf_var)s"', - '"perf_value": "%(check_perf_value)s"', - '"unit": "%(check_perf_unit)s"', - '"description": "%(check_descr)s"', - '"job_completion_time": "%(check_job_completion_time)s"', - '"check_result": "%(check_result)s"', - ] - -logging_format = perf_logging_format + ['"message": "%(message)s"', '"time": "%(asctime)s"}'] -perf_logging_format[-1] += '}' +# use 'info' to log to syslog +syslog_level = 'warning' # To run jobs on the kul cluster, you need to be a member of the following # vsc group kul_account_string_tier2 = '-A lpt2_vsc_test_suite' +# To run jobs on the calcua cluster, you need to be a member of the following +# vsc group +calcua_account_string_tier2 = '-A ap_calcua_staff' + # By default, not all installed modules are visible on the genius cluster genius_modulepath = [] for version in ['2018a', '2019b', '2021a']: @@ -188,26 +161,26 @@ 'access': [], 'environs': ['standard'], 'descr': 'tests in the local node (no job)', - 'max_jobs': 1, + 'max_jobs': 10, 'launcher': 'local', }, { 'name': 'single-node', 'scheduler': 'slurm', 'modules': [], - 'access': [], + 'access': [calcua_account_string_tier2], 'environs': ['standard'], 'descr': 'single-node jobs', - 'max_jobs': 1, + 'max_jobs': 10, 'launcher': 'local', }, - { + { 'name': 'mpi-job', 'scheduler': 'slurm', - 'access': [], + 'access': [calcua_account_string_tier2], 'environs': ['intel-2021a'], 'descr': 'MPI jobs', - 'max_jobs': 1, + 'max_jobs': 10, # TODO Here we actually want to set vsc-mympirun, but since # this is a custom launcher not shipped with ReFrame, we # can only do this in the test itself after registering the @@ -217,7 +190,7 @@ { 'name': 'nvidia', 'scheduler': 'slurm', - 'access': ['-p ampere_gpu'], + 'access': [calcua_account_string_tier2, '-p ampere_gpu'], 'environs': ['CUDA', 'standard'], 'descr': 'Nvidia ampere node', 'max_jobs': 1, @@ -244,26 +217,26 @@ 'access': [], 'environs': ['standard'], 'descr': 'tests in the local node (no job)', 
- 'max_jobs': 1, + 'max_jobs': 10, 'launcher': 'local', }, { 'name': 'single-node', 'scheduler': 'slurm', 'modules': [], - 'access': [], + 'access': [calcua_account_string_tier2], 'environs': ['standard'], 'descr': 'single-node jobs', - 'max_jobs': 1, + 'max_jobs': 10, 'launcher': 'local', }, { 'name': 'mpi-job', 'scheduler': 'slurm', - 'access': [], + 'access': [calcua_account_string_tier2], 'environs': ['intel-2021a'], 'descr': 'MPI jobs', - 'max_jobs': 1, + 'max_jobs': 10, # TODO Here we actually want to set vsc-mympirun, but since # this is a custom launcher not shipped with ReFrame, we # can only do this in the test itself after registering the @@ -273,7 +246,7 @@ { 'name': 'nvidia', 'scheduler': 'slurm', - 'access': ['-p pascal_gpu'], + 'access': [calcua_account_string_tier2, '-p pascal_gpu'], 'environs': ['CUDA', 'standard'], 'descr': 'Nvidia pascal nodes', 'max_jobs': 2, @@ -287,6 +260,47 @@ } ] }, + { + 'name': 'breniac', + 'descr': 'VSC Tier-2 Breniac', + 'hostnames': ['login.breniac'], + 'modules_system': 'lmod', + 'partitions': [ + { + 'name': 'local', + 'scheduler': 'local', + 'modules': [], + 'access': [], + 'environs': ['standard'], + 'descr': 'tests in the local node (no job)', + 'max_jobs': 10, + 'launcher': 'local', + }, + { + 'name': 'single-node', + 'scheduler': 'slurm', + 'modules': [], + 'access': [calcua_account_string_tier2], + 'environs': ['standard'], + 'descr': 'single-node jobs', + 'max_jobs': 10, + 'launcher': 'local', + }, + { + 'name': 'mpi-job', + 'scheduler': 'slurm', + 'access': [calcua_account_string_tier2], + 'environs': ['intel-2021a'], + 'descr': 'MPI jobs', + 'max_jobs': 10, + # TODO Here we actually want to set vsc-mympirun, but since + # this is a custom launcher not shipped with ReFrame, we + # can only do this in the test itself after registering the + # vsc-mympirun launcher + 'launcher': 'srun', + }, + ] + }, ], 'environments': [ { @@ -313,40 +327,19 @@ { 'purge_environment': True, 'resolve_module_conflicts': False, # 
avoid loading the module before submitting the job - 'keep_stage_files': True, } ], 'logging': [ { 'level': 'debug', 'handlers': [ - { - 'type': 'file', - 'name': 'reframe.log', - 'level': 'debug', - 'format': '[%(asctime)s] %(levelname)s: %(check_name)s: %(message)s', # noqa: E501 - 'append': False, - }, { 'type': 'stream', 'name': 'stdout', 'level': 'info', 'format': '%(message)s', }, - { - 'type': 'file', - 'name': 'reframe.out', - 'level': 'info', - 'format': '%(message)s', - 'append': False, - }, ], } ], - 'modes': [ - { - 'name': 'UAstandard', - 'options': antwerpen_mode_options, - }, - ] -} \ No newline at end of file +} diff --git a/run.sh b/run.sh index 3182816..35e1ad6 100755 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -module load ReFrame/4.2.0 +module load ReFrame/4.6.1 export RFM_CONFIG_FILES=$(dirname $0)/config_vsc.py export RFM_CHECK_SEARCH_PATH=$(dirname $0)/tests @@ -7,5 +7,5 @@ export RFM_PREFIX=$VSC_SCRATCH/reframe export RFM_CHECK_SEARCH_RECURSIVE=true export RFM_SAVE_LOG_FILES=true -reframe --run "$@" +reframe --keep-stage-files --run "$@" #rm $(dirname $0)/reframe.out $(dirname $0)/reframe.log diff --git a/tests/apps/gaussian/gaussian.py b/tests/apps/gaussian/gaussian.py index 03b5ac2..d2b0a88 100644 --- a/tests/apps/gaussian/gaussian.py +++ b/tests/apps/gaussian/gaussian.py @@ -16,10 +16,10 @@ def __init__(self): 'time': ( sn.extractsingle( r'^real\t(?P<minutes>\S+)m\S+s', - 'rfm_GaussianCPUTest_job.err', 'minutes', float) + + self.stderr, 'minutes', float) + sn.extractsingle( r'^real\t\S+m(?P<seconds>\S+)s', - 'rfm_GaussianCPUTest_job.err', 'seconds', float) / 60.0) + self.stderr, 'seconds', float) / 60.0) } self.maintainers = ['Lewih'] @@ -45,7 +45,7 @@ def __init__(self): }, } - self.tags = {'apps', 'gaussian', 'performance'} + self.tags = {'apps', 'gaussian', 'performance', 'vsc'} @run_after('setup') diff --git a/tests/apps/julia/julia-linalg.py b/tests/apps/julia/julia-linalg.py index 384453b..8516509 100644 --- a/tests/apps/julia/julia-linalg.py +++
b/tests/apps/julia/julia-linalg.py @@ -10,7 +10,7 @@ def __init__(self): self.modules = ['Julia'] self.executable = 'julia' self.executable_opts = ['linalg.jl'] - self.tags = {'apps', 'julia', '1nodes', 'performance'} + self.tags = {'apps', 'julia', '1nodes', 'performance', 'vsc'} self.maintainers = ['Lewih'] self.time_limit = '10m' @@ -50,6 +50,11 @@ def __init__(self): 'cholesky': (0.57, None, 0.1, 'seconds'), 'lu': (0.31, None, 0.1, 'seconds'), }, + 'breniac:single-node': { + 'dot': (0.47, None, 0.1, 'seconds'), + 'cholesky': (0.57, None, 0.1, 'seconds'), + 'lu': (0.31, None, 0.1, 'seconds'), + }, 'hortense:single-node': { 'dot': (0.44, None, 0.1, 'seconds'), 'cholesky': (0.47, None, 0.1, 'seconds'), diff --git a/tests/apps/matlab/matlab-linalg.py b/tests/apps/matlab/matlab-linalg.py index aac7b73..79cab3a 100644 --- a/tests/apps/matlab/matlab-linalg.py +++ b/tests/apps/matlab/matlab-linalg.py @@ -30,7 +30,7 @@ def __init__(self): self.executable = 'cat' self.executable_opts = ['linalg.m | matlab -nodesktop -nosplash'] self.num_tasks_per_node = 1 - self.tags = {'apps', 'matlab', 'performance'} + self.tags = {'apps', 'matlab', 'performance', 'vsc'} self.maintainers = ['Lewih'] @@ -40,6 +40,7 @@ def __init__(self): super().__init__() self.valid_systems = ['leibniz:single-node', 'vaughan:single-node', + 'breniac:single-node', 'hydra:single-node', 'genius:single-node'] @@ -54,6 +55,11 @@ def __init__(self): 'cholesky': (0.06, None, 0.10, 'seconds'), 'lu': (0.18, None, 0.10, 'seconds'), }, + 'breniac:single-node': { + 'dot': (0.28, None, 0.10, 'seconds'), + 'cholesky': (0.06, None, 0.10, 'seconds'), + 'lu': (0.24, None, 0.10, 'seconds'), + }, 'genius:single-node': { 'dot': (0.14, None, 0.10, 'seconds'), 'cholesky': (0.05, None, 0.10, 'seconds'), @@ -68,7 +74,7 @@ def __init__(self): @run_after('setup') def set_num_cpus(self): - if self.current_system.name == 'leibniz': + if self.current_system.name in ['leibniz', 'breniac']: self.num_cpus_per_task = 28 elif 
self.current_system.name == 'vaughan': self.num_cpus_per_task = 32 diff --git a/tests/apps/namd/namd_test.py b/tests/apps/namd/namd_test.py index 1b7c3fc..60b79d6 100644 --- a/tests/apps/namd/namd_test.py +++ b/tests/apps/namd/namd_test.py @@ -22,7 +22,7 @@ def __init__(self, arch): self.maintainers = ['Lewih'] - self.tags = {'apps', 'namd', 'performance'} + self.tags = {'apps', 'namd', 'performance', 'vsc'} self.tags.add(f'{self.num_nodes}nodes') @run_before('run') @@ -109,6 +109,7 @@ def __init__(self): self.valid_systems = ['leibniz:single-node', 'vaughan:single-node', + 'breniac:single-node', 'genius:single-node', 'hydra:single-node'] @@ -119,18 +120,21 @@ def __init__(self): '1': { 'leibniz:single-node': {'days_ns': (0.347779, None, 0.05, 'days/ns')}, 'vaughan:single-node': {'days_ns': (0.188093, None, 0.05, 'days/ns')}, + 'breniac:single-node': {'days_ns': (0.8281605, None, 0.05, 'days/ns')}, 'hydra:single-node': {'days_ns': (0.202701, None, 0.05, 'days/ns')}, 'genius:single-node': {'days_ns': (0.210896, None, 0.05, 'days/ns')}, }, '2': { 'leibniz:single-node': {'days_ns': (0.1782715, None, 0.05, 'days/ns')}, 'vaughan:single-node': {'days_ns': (0.09856985, None, 0.05, 'days/ns')}, + 'breniac:single-node': {'days_ns': (0.1459575, None, 0.05, 'days/ns')}, 'hydra:single-node': {'days_ns': (0.1099565, None, 0.05, 'days/ns')}, 'genius:single-node': {'days_ns': (0.1151855, None, 0.05, 'days/ns')}, }, '4': { 'leibniz:single-node': {'days_ns': (1.05726, None, 0.05, 'days/ns')}, 'vaughan:single-node': {'days_ns': (0.5438339, None, 0.05, 'days/ns')}, + 'breniac:single-node': {'days_ns': (0.8281605, None, 0.05, 'days/ns')}, 'hydra:single-node': {'days_ns': (0.5427335, None, 0.05, 'days/ns')}, 'genius:single-node': {'days_ns': (0.565824, None, 0.05, 'days/ns')}, }, @@ -144,7 +148,7 @@ def set_num_cpus(self): configFile = self.download_material() # VSC specific config - if self.current_system.name == 'leibniz': + if self.current_system.name in ['leibniz', 'breniac']: 
self.num_cpus_per_task = 28 self.modules = ['NAMD/2.14-verbs'] launcher = 'charm_antwerp' diff --git a/tests/apps/python/numpy_check.py b/tests/apps/python/numpy_check.py index 788ebd7..d3b14c0 100644 --- a/tests/apps/python/numpy_check.py +++ b/tests/apps/python/numpy_check.py @@ -35,25 +35,47 @@ def __init__(self): self.executable = 'python3' self.executable_opts = ['np_ops.py'] # self.use_multithreading = False - self.tags = {'apps', 'python', 'performance'} + self.tags = {'apps', 'python', 'numpy', 'performance', 'vsc'} self.maintainers = ['Lewih'] self.valid_systems = ['*:single-node'] self.reference = { + # old references with intel + # 'vaughan:single-node': { + # 'dot': (0.7, None, 0.10, 'seconds'), + # 'svd': (0.6, None, 0.10, 'seconds'), + # 'cholesky': (0.28, None, 0.10, 'seconds'), + # 'eigendec': (7.0, None, 0.10, 'seconds'), + # 'inv': (0.40, None, 0.10, 'seconds'), + # }, + # 'leibniz:single-node': { + # 'dot': (0.72, None, 0.10, 'seconds'), + # 'svd': (0.42, None, 0.10, 'seconds'), + # 'cholesky': (0.1, None, 0.10, 'seconds'), + # 'eigendec': (4.3, None, 0.10, 'seconds'), + # 'inv': (0.25, None, 0.10, 'seconds'), + # }, 'vaughan:single-node': { - 'dot': (0.7, None, 0.10, 'seconds'), - 'svd': (0.6, None, 0.10, 'seconds'), - 'cholesky': (0.28, None, 0.10, 'seconds'), - 'eigendec': (7.0, None, 0.10, 'seconds'), - 'inv': (0.40, None, 0.10, 'seconds'), + 'dot': (0.84, None, 0.10, 'seconds'), + 'svd': (1.48, None, 0.10, 'seconds'), + 'cholesky': (0.57, None, 0.10, 'seconds'), + 'eigendec': (13.06, None, 0.10, 'seconds'), + 'inv': (0.73, None, 0.10, 'seconds'), }, 'leibniz:single-node': { - 'dot': (0.72, None, 0.10, 'seconds'), - 'svd': (0.42, None, 0.10, 'seconds'), + 'dot': (0.87, None, 0.10, 'seconds'), + 'svd': (1.0, None, 0.10, 'seconds'), 'cholesky': (0.1, None, 0.10, 'seconds'), - 'eigendec': (4.3, None, 0.10, 'seconds'), + 'eigendec': (7, None, 0.10, 'seconds'), 'inv': (0.25, None, 0.10, 'seconds'), }, + 'breniac:single-node': { + 'dot': (0.7, 
None, 0.10, 'seconds'), + 'svd': (0.81, None, 0.10, 'seconds'), + 'cholesky': (0.28, None, 0.10, 'seconds'), + 'eigendec': (7.0, None, 0.10, 'seconds'), + 'inv': (0.40, None, 0.10, 'seconds'), + }, 'genius:single-node': { 'dot': (0.7, None, 0.10, 'seconds'), 'svd': (0.6, None, 0.10, 'seconds'), @@ -91,3 +113,6 @@ def set_num_cpus(self): elif self.current_system.name == "hortense": self.modules = ["SciPy-bundle/2021.10-foss-2021b"] self.job.options = ["--exclusive"] + elif self.current_system.name in ["leibniz", "vaughan", "breniac"]: + self.modules = ["SciPy-bundle/2021.05-foss-2021a"] + self.job.options = ["--exclusive"] diff --git a/tests/gpu/gpu_burn.py b/tests/gpu/gpu_burn.py index 071a6e8..454a454 100644 --- a/tests/gpu/gpu_burn.py +++ b/tests/gpu/gpu_burn.py @@ -12,21 +12,22 @@ class GPU_Burn_nvidia(rfm.RunOnlyRegressionTest): time_limit = '10m' prerun_cmds = ['git clone https://github.com/wilicc/gpu-burn.git', 'cd gpu-burn', 'make'] executable = '--output=rfm_GPUBURN_nvidia_node-%N.out ./gpu_burn 20' - tags = {"gpu", "burn", "performance"} + tags = {"gpu", "burn", "performance", "vsc"} num_devices = 0 num_tasks_per_node = 1 + # no upper bound, keep lower bound for reference reference = { 'vaughan:nvidia': { - 'device0_nvam1': (17339.0, -0.05, 0.05, 'Gflop/s'), - 'device1_nvam1': (17336.0, -0.05, 0.05, 'Gflop/s'), - 'device2_nvam1': (17340.0, -0.05, 0.05, 'Gflop/s'), - 'device3_nvam1': (17335.0, -0.05, 0.05, 'Gflop/s'), + 'nvam1_device0': (17339.0, -0.05, None, 'Gflop/s'), + 'nvam1_device1': (17336.0, -0.05, None, 'Gflop/s'), + 'nvam1_device2': (17340.0, -0.05, None, 'Gflop/s'), + 'nvam1_device3': (17335.0, -0.05, None, 'Gflop/s'), }, 'leibniz:nvidia': { - 'device0_nvpa1': (7412.0, -0.05, 0.05, 'Gflop/s'), - 'device1_nvpa1': (7412.0, -0.05, 0.05, 'Gflop/s'), - 'device0_nvpa2': (7412.0, -0.05, 0.05, 'Gflop/s'), - 'device1_nvpa2': (7412.0, -0.05, 0.05, 'Gflop/s'), + 'nvpa1_device0': (7412.0, -0.05, None, 'Gflop/s'), + 'nvpa1_device1': (7412.0, -0.05, None, 
'Gflop/s'), + 'nvpa2_device0': (7412.0, -0.05, None, 'Gflop/s'), + 'nvpa2_device1': (7412.0, -0.05, None, 'Gflop/s'), } } @@ -47,22 +48,23 @@ def assert_job(self): result = True for n in sorted(self.job.nodelist): node = n.split('.')[0] - result = sn.and_(sn.and_(sn.assert_found(r'OK', f'gpu-burn/rfm_GPUBURN_nvidia_node-{node}.out'), sn.assert_not_found(r'FAULTY', f'gpu-burn/rfm_GPUBURN_nvidia_node-{node}.out')), result) - + result = sn.and_(sn.and_(sn.assert_found(r'OK', self.stagedir+f'/gpu-burn/rfm_GPUBURN_nvidia_node-{node}.out'), sn.assert_not_found(r'FAULTY', self.stagedir+f'/gpu-burn/rfm_GPUBURN_nvidia_node-{node}.out')), result) return result @performance_function('Gflop/s') def get_gflops(self, device=0, node=None): - return sn.extractsingle(r'\((?P<gflops>\S+) Gflop/s\)', f'gpu-burn/rfm_GPUBURN_nvidia_node-{node}.out', 'gflops', float, item=(-device-1)) + # take starting from item -1 (last match) + return sn.extractsingle(r'\((?P<gflops>\S+) Gflop/s\)', self.stagedir+f'/gpu-burn/rfm_GPUBURN_nvidia_node-{node}.out', 'gflops', float, item=(-device-1)) @run_before('performance') def set_perf_variables(self): '''Build the dictionary with all the performance variables.''' self.perf_variables = {} - - for n in self.job.nodelist: - node =n.split('.')[0] - device = 0 - for x in range(self.num_devices): - self.perf_variables[f'device{device}_{node}'] = self.get_gflops(device=self.num_devices-device, node=node) - device += 1 + # for dry runs, check if nodelist is empty + if self.job.nodelist: + for n in self.job.nodelist: + node =n.split('.')[0] + device = 0 + for x in range(self.num_devices): + self.perf_variables[f'{node}_device{self.num_devices-device-1}'] = self.get_gflops(device=self.num_devices-device, node=node) + device += 1