Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add Tensorflow test #38

Merged
merged 36 commits into from
Jul 31, 2023
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
ff57caa
Initial version of the python files for the TensorFlow test
May 22, 2023
5b16306
Merge branch 'EESSI:main' into tensorflow
casparvl May 22, 2023
61c29ec
Modify list of visible GPU devices to empty when argument is cpu
May 23, 2023
b466396
Merge branch 'tensorflow' of github.com:casparvl/test-suite into tens…
May 24, 2023
5428ff5
Moved test files
May 24, 2023
b058bc9
Reorganized the test code. Less hardcoded, more arguments, more segme…
May 24, 2023
9d599c5
Added ReFrame TensorFlow test. Still a work in progress: number of ta…
May 24, 2023
85fab0e
Added support for launching one task per socket, useful for hybrid pr…
May 24, 2023
7aab841
Added binding environment variables
Jun 5, 2023
e9cab6b
Set process binding. We commented out thread binding for now, as on m…
Jun 13, 2023
bf40e9a
merged with main, resolved conflicts due to renaming of namespace
Jun 13, 2023
09e7704
Made separate hooks for binding processes and binding threads. Only d…
Jun 16, 2023
c063f64
Use tf.config.threading api to set number of threads
Jun 16, 2023
27bf9aa
Increased default batch size to correspond to the many tests run in h…
Jun 19, 2023
0ab5a39
Merge branch 'main' into tensorflow
casparvl Jun 19, 2023
e682b7a
Add logging to binding hooks
Jun 19, 2023
c359c3a
Make more sensible default behaviour for partial nodes in the SCALES
Jun 19, 2023
18a20ae
Changed optimizer, faster convergence, so we can more sure that the n…
Jun 20, 2023
79eebb7
Sanity check for large node counts was failing since I/O was not coo…
Jun 20, 2023
8903e40
Fix process binding on hyperthreading enabled systems
Jun 26, 2023
fae59b1
Define separate variable, comment, and reuse that
Jun 26, 2023
c6d77ff
Add some support for systems with hyperthreading to assign_one_task_p…
Jun 28, 2023
d52ae1a
Trying to get hyperthreading to do something sensible. Not working yet
Jun 28, 2023
dca0a2e
Add some context
Jun 28, 2023
1c023d1
Revert changes to make TF work on hyperthreading systems
Jun 28, 2023
358966e
Should not have been in this PR, is part of another
Jul 18, 2023
16ddf31
Update eessi/testsuite/tests/apps/tensorflow/src/tf_test.py
casparvl Jul 18, 2023
50d49a6
Merge remote-tracking branch 'origin/main' into tensorflow
Jul 26, 2023
ea5414b
Update eessi/testsuite/tests/apps/tensorflow/src/tf_test.py
casparvl Jul 26, 2023
932b2f3
Merge branch 'tensorflow' of github.com:casparvl/test-suite into tens…
Jul 26, 2023
03f37d0
Update eessi/testsuite/hooks.py
casparvl Jul 26, 2023
178d70c
Check if ReFrame config specifies the required processor attributes
Jul 27, 2023
4416d08
Merge branch 'tensorflow' of github.com:casparvl/test-suite into tens…
Jul 27, 2023
e87e628
Implemented review comments: logging, calling a hook for the process …
Jul 27, 2023
ea561f4
Merged main into this branch, then resolved conflicts
Jul 28, 2023
9e1b5a2
Merge branch 'main' into tensorflow
Jul 31, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions config/izum_vega.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from os import environ

# Username of the user running the tests.
# NOTE(review): currently unused in this example config — presumably intended for
# user-specific staging/output directories; confirm before relying on it.
username = environ.get('USER')

# This is an example ReFrame configuration file for Vega (https://en-doc.vega.izum.si/)
site_configuration = {
    'general': [
        {
            # Auto-detect processor topology on the remote compute nodes
            'remote_detect': True,
        }
    ],
    'systems': [
        {
            'name': 'vega',
            'descr': 'Vega, a EuroHPC JU system',
            'modules_system': 'lmod',
            'hostnames': ['vglogin*', 'cn*', 'gn*'],
            # Plain strings: these paths contain no placeholders, so no f-string is needed
            'stagedir': 'reframe_runs/staging',
            'outputdir': 'reframe_runs/output',
            'partitions': [
                {
                    'name': 'cpu',
                    'scheduler': 'slurm',
                    'prepare_cmds': [
                        'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash',
                        'export SLURM_EXPORT_ENV=ALL',
                        # Avoid https://github.com/EESSI/software-layer/issues/136
                        # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
                        'export OMPI_MCA_pml=ucx',
                    ],
                    'launcher': 'mpirun',
                    'access': ['-p cpu', '--export=None'],
                    'environs': ['default'],
                    'max_jobs': 120,
                    'features': [
                        'cpu',
                    ],
                    'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/'
                },
                {
                    'name': 'gpu',
                    'scheduler': 'slurm',
                    'prepare_cmds': [
                        'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash',
                        'export SLURM_EXPORT_ENV=ALL',
                        # Avoid https://github.com/EESSI/software-layer/issues/136
                        # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
                        'export OMPI_MCA_pml=ucx',
                    ],
                    'launcher': 'mpirun',
                    'access': ['-p gpu', '--export=None'],
                    'environs': ['default'],
                    'max_jobs': 60,
                    'devices': [
                        {
                            'type': 'gpu',
                            'num_devices': 4,
                        }
                    ],
                    'resources': [
                        {
                            # Scheduler resource used by ReFrame to request GPUs
                            'name': '_rfm_gpu',
                            'options': ['--gpus-per-node={num_gpus_per_node}'],
                        }
                    ],
                    'features': [
                        'gpu',
                    ],
                    'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
                },
            ]
        },
    ],
    'environments': [
        {
            # Minimal default environment; only a C compiler is defined
            'name': 'default',
            'cc': 'cc',
            'cxx': '',
            'ftn': '',
        },
    ],
    'logging': [
        {
            'level': 'debug',
            'handlers': [
                {
                    'type': 'stream',
                    'name': 'stdout',
                    'level': 'info',
                    'format': '%(message)s'
                },
                {
                    'type': 'file',
                    'name': 'reframe.log',
                    'level': 'debug',
                    'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s',  # noqa: E501
                    'append': False,
                    'timestamp': "%Y%m%d_%H%M%S",
                }
            ],
            'handlers_perflog': [
                {
                    'type': 'filelog',
                    'prefix': '%(check_system)s/%(check_partition)s',
                    'level': 'info',
                    'format': (
                        '%(check_job_completion_time)s|reframe %(version)s|'
                        '%(check_info)s|jobid=%(check_jobid)s|'
                        '%(check_perf_var)s=%(check_perf_value)s|'
                        'ref=%(check_perf_ref)s '
                        '(l=%(check_perf_lower_thres)s, '
                        'u=%(check_perf_upper_thres)s)|'
                        '%(check_perf_unit)s'
                    ),
                    'append': True
                }
            ]
        }
    ],
}
1 change: 1 addition & 0 deletions eessi/testsuite/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Canonical device/compute-unit identifiers; keys are the symbolic names used in
# tests, values are the lowercase strings used in tags/features.
DEVICES = {
    device_name: device_name.lower()
    for device_name in ('GPU', 'CPU', 'CPU_SOCKET')
}

TAGS = {
Expand Down
153 changes: 150 additions & 3 deletions eessi/testsuite/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,40 @@
'''


def assign_one_task_per_compute_unit(test: rfm.RegressionTest, compute_unit: str):
def assign_one_task_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, use_hyperthreading: bool = False):
"""
Assign one task per compute unit (DEVICES['CPU'] or DEVICES['GPU']).
Assign one task per compute unit (DEVICES['CPU'], DEVICES['CPU_SOCKET'] or DEVICES['GPU']).
casparvl marked this conversation as resolved.
Show resolved Hide resolved
Automatically sets num_tasks, num_tasks_per_node, num_cpus_per_task, and num_gpus_per_node,
based on the current scale and the current partition’s num_cpus, max_avail_gpus_per_node and num_nodes.
For GPU tests, one task per GPU is set, and num_cpus_per_task is based on the ratio of CPU-cores/GPUs.
For CPU tests, one task per CPU is set, and num_cpus_per_task is set to 1.
Total task count is determined based on the number of nodes to be used in the test.
If use_hyperthreading is True, each hyperthread is considered a valid place to run one thread.
Behaviour of this function is (usually) sensible for MPI tests.

Arguments:
- test: the ReFrame test to which this hook should apply
- compute_unit: a device as listed in eessi.testsuite.constants.DEVICES
- use_hyperthreading: whether hyperthreading should be considered when computing task counts

Examples:
On a single node with 2 sockets, 64 cores and 128 hyperthreads:
- assign_one_task_per_compute_unit(test, DEVICES['CPU'], false) will launch 64 tasks with 1 thread
- assign_one_task_per_compute_unit(test, DEVICES['CPU'], true) will launch 128 tasks with 1 thread
- assign_one_task_per_compute_unit(test, DEVICES['CPU_SOCKET'], false) will launch 2 tasks with 32 threads per task
- assign_one_task_per_compute_unit(test, DEVICES['CPU_SOCKET'], true) will launch 2 tasks with 64 threads per task
"""
# if use_hyperthreading and test.current_partition.processor.num_cpus:
# test.max_avail_cpus_per_node = test.current_partition.processor.num_cpus
# elif (not use_hyperthreading and
# test.current_partition.processor.num_cpus and
# test.current_partition.processor.num_cpus_per_core):
# test.max_avail_cpus_per_node = \
# test.current_partition.processor.num_cpus / test.current_partition.processor.num_cpus_per_core
test.max_avail_cpus_per_node = test.current_partition.processor.num_cpus
if test.max_avail_cpus_per_node is None:
raise AttributeError(PROCESSOR_INFO_MISSING)
log(f'max_avail_cpus_per_node set to {test.max_avail_cpus_per_node}')

# Check if either node_part, or default_num_cpus_per_node and default_num_gpus_per_node are set correctly
if not (
Expand Down Expand Up @@ -63,9 +84,75 @@ def assign_one_task_per_compute_unit(test: rfm.RegressionTest, compute_unit: str
_assign_one_task_per_gpu(test)
elif compute_unit == DEVICES['CPU']:
_assign_one_task_per_cpu(test)
elif compute_unit == DEVICES['CPU_SOCKET']:
_assign_one_task_per_cpu_socket(test)
else:
raise ValueError(f'compute unit {compute_unit} is currently not supported')

def _assign_one_task_per_cpu_socket(test: rfm.RegressionTest, use_hyperthreading: bool = False):
    """
    Assign one task per CPU socket.

    Determines the number of tasks per node by dividing default_num_cpus_per_node by
    the number of CPUs available per socket, rounding up. For full-node jobs the
    default is therefore one task per socket, with num_cpus_per_task equal to the
    number of CPUs per socket. Other examples:
    - half a node (i.e. node_part=2) on a 4-socket system results in 2 tasks per
      node, each with the number of CPUs of one socket
    - 2 cores (i.e. default_num_cpus_per_node=2) on a 16-core system with 2 sockets
      results in 1 task per node, with 2 CPUs per task

    These defaults are only set if the test was not invoked with
    --setvar num_tasks_per_node=<x> and/or --setvar num_cpus_per_task=<y>.
    Explicitly set values take precedence, and the remaining variable
    (num_cpus_per_task or num_tasks_per_node, respectively) is derived from
    test.num_tasks_per_node * test.num_cpus_per_task == test.default_num_cpus_per_node.

    Arguments:
    - test: the ReFrame test to which this hook should apply
    - use_hyperthreading: currently ignored; hyperthreads are not taken into account
      # TODO: implement hyperthreading support (see assign_one_task_per_compute_unit)

    Variables read:
    - default_num_cpus_per_node: default number of CPUs per node as defined in the
      test (e.g. by earlier hooks like set_tag_scale)

    Raises:
    - AttributeError: if the ReFrame configuration lacks the required processor info
    """
    processor = test.current_partition.processor

    if not test.num_tasks_per_node:
        # We need the socket and CPU counts from the ReFrame config to derive defaults;
        # fail early with a clear message instead of a TypeError further down
        if not (processor.num_sockets and processor.num_cpus):
            raise AttributeError(PROCESSOR_INFO_MISSING)
        num_cpus_per_socket = processor.num_cpus / processor.num_sockets

        if not test.num_cpus_per_task:
            # Neither is set: one task per (partial) socket, splitting the requested
            # CPUs evenly over the tasks
            test.num_tasks_per_node = math.ceil(test.default_num_cpus_per_node / num_cpus_per_socket)
            test.num_cpus_per_task = int(test.default_num_cpus_per_node / test.num_tasks_per_node)
        else:
            # num_cpus_per_task was set explicitly: derive the task count from it
            test.num_tasks_per_node = int(test.default_num_cpus_per_node / test.num_cpus_per_task)

    elif not test.num_cpus_per_task:
        # num_tasks_per_node was set explicitly: derive the CPUs per task from it
        test.num_cpus_per_task = int(test.default_num_cpus_per_node / test.num_tasks_per_node)

    # else: both num_tasks_per_node and num_cpus_per_task were set explicitly

    test.num_tasks = test.num_nodes * test.num_tasks_per_node

def _assign_one_task_per_cpu(test: rfm.RegressionTest):
"""
Expand Down Expand Up @@ -236,5 +323,65 @@ def check_custom_executable_opts(test: rfm.RegressionTest, num_default: int = 0)
test.has_custom_executable_opts = False
if len(test.executable_opts) > num_default:
test.has_custom_executable_opts = True

log(f'has_custom_executable_opts set to {test.has_custom_executable_opts}')


def set_compact_process_binding(test: rfm.RegressionTest):
    """
    This hook sets a binding policy for process binding.
    More specifically, it will bind each process to subsequent domains of
    test.num_cpus_per_task cores.

    A few examples:
    - Pure MPI (test.num_cpus_per_task = 1) will result in binding 1 process to each core,
      in a compact way, i.e. rank 0 to core 0, rank 1 to core 1, etc.
    - Hybrid MPI-OpenMP, e.g. test.num_cpus_per_task = 4, will result in binding 1 process
      to subsequent sets of 4 cores, i.e. rank 0 to cores 0-3, rank 1 to cores 4-7, etc.

    It is hard to do this in a portable way. Currently supported for process binding are:
    - Intel MPI (through I_MPI_PIN_DOMAIN)
    - OpenMPI (through OMPI_MCA_rmaps_base_mapping_policy)
    - srun (LIMITED SUPPORT: through SLURM_CPU_BIND, but only effective if the
      task/affinity plugin is enabled)

    Raises:
    - AttributeError: if the ReFrame configuration lacks num_cpus_per_core
    """

    # WARNING: this function currently binds tasks to cores, which assumes
    # assign_one_task_per_compute_unit is called with use_hyperthreading = False.
    # It should be extended to also do proper binding when use_hyperthreading = True.

    # If hyperthreading is enabled, num_cpus_per_task counts hardware threads, while
    # the pin domains below are expressed in physical cores
    num_cpus_per_core = test.current_partition.processor.num_cpus_per_core
    if num_cpus_per_core is None:
        raise AttributeError(PROCESSOR_INFO_MISSING)

    # Number of physical cores to bind each individual task to. On systems without
    # hyperthreading (num_cpus_per_core == 1) this equals num_cpus_per_task.
    physical_cpus_per_task = int(test.num_cpus_per_task / num_cpus_per_core)

    # Do binding for Intel MPI's and OpenMPI's mpirun, and for srun.
    # Other launchers may or may not do the correct binding.
    test.env_vars['I_MPI_PIN_CELL'] = 'core'  # Don't bind to hyperthreads, only to physical cores
    # Use physical core counts here: with I_MPI_PIN_CELL=core the domain size is in
    # cores, so using num_cpus_per_task directly would double-count hyperthreads
    test.env_vars['I_MPI_PIN_DOMAIN'] = '%s:compact' % physical_cpus_per_task
    test.env_vars['OMPI_MCA_rmaps_base_mapping_policy'] = 'node:PE=%s' % physical_cpus_per_task
    # Default binding for SLURM. Only effective if the task/affinity plugin is enabled
    # and when number of tasks times cpus per task equals either socket, core or thread count
    test.env_vars['SLURM_CPU_BIND'] = 'q'
    log(f'Set environment variable I_MPI_PIN_DOMAIN to {test.env_vars["I_MPI_PIN_DOMAIN"]}')
    log(f'Set environment variable OMPI_MCA_rmaps_base_mapping_policy to {test.env_vars["OMPI_MCA_rmaps_base_mapping_policy"]}')
    log(f'Set environment variable SLURM_CPU_BIND to {test.env_vars["SLURM_CPU_BIND"]}')


def set_compact_thread_binding(test: rfm.RegressionTest):
    """
    This hook sets a binding policy for thread binding.
    It sets a number of environment variables to try and set a sensible binding for OPENMP tasks.

    Thread binding is supported for:
    - GNU OpenMP (through OMP_NUM_THREADS, OMP_PLACES and OMP_PROC_BIND)
    - Intel OpenMP (through KMP_AFFINITY)
    """
    # KMP_AFFINITY reference:
    # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-8/thread-affinity-interface.html
    thread_binding_vars = {
        'OMP_PLACES': 'cores',
        'OMP_PROC_BIND': 'close',
        'KMP_AFFINITY': 'granularity=fine,compact,1,0',
    }
    for var_name, var_value in thread_binding_vars.items():
        test.env_vars[var_name] = var_value
        log(f'Set environment variable {var_name} to {var_value}')
31 changes: 31 additions & 0 deletions eessi/testsuite/tests/apps/tensorflow/src/mnist_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import tensorflow as tf
import numpy as np

def mnist_dataset(batch_size, test_batch_size):
    """
    Load MNIST and wrap it in tf.data pipelines.

    Returns a tuple (train_dataset, test_dataset): the training set is shuffled,
    repeated indefinitely and batched with batch_size; the test set is only
    batched with test_batch_size.
    """
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
    # The image arrays are uint8 with values in the [0, 255] range; convert them
    # to float32 with values in the [0, 1] range.
    train_images = train_images / np.float32(255)
    test_images = test_images / np.float32(255)
    # Cast the labels to int64
    train_labels = train_labels.astype(np.int64)
    test_labels = test_labels.astype(np.int64)

    train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    train_dataset = train_dataset.shuffle(60000).repeat().batch(batch_size)
    test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels))
    test_dataset = test_dataset.batch(test_batch_size)
    return train_dataset, test_dataset

def build_and_compile_cnn_model():
    """
    Build and compile a small CNN for MNIST classification.

    The model takes 28x28 inputs and outputs logits for the 10 digit classes;
    it is compiled with sparse categorical cross-entropy (computed from logits)
    and the Adam optimizer.
    """
    layers = tf.keras.layers
    cnn_model = tf.keras.Sequential([
        layers.InputLayer(input_shape=(28, 28)),
        layers.Reshape(target_shape=(28, 28, 1)),  # add the channel dimension
        layers.Conv2D(32, 3, activation='relu'),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(10),  # logits; no softmax, loss uses from_logits=True
    ])
    cnn_model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return cnn_model
Loading