use mixin class for pytorch #213

Open · wants to merge 4 commits into base: main
Changes from 2 commits
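
In short, this PR makes EESSI_PyTorch_torchvision inherit from EESSI_Mixin, renames compute_device to device_type, sets compute_unit and bench_name_ci as class attributes, and drops the hand-written init/setup hooks (scale filtering, module selection, task assignment, CI tagging) that the mixin now provides. The sketch below shows the resulting shape of a mixin-based test; it is illustrative only (Example_MixinBased_Test is a placeholder name) and is grounded in the changes visible in this diff rather than the full EESSI_Mixin contract:

```python
# Minimal sketch (not the actual EESSI test) of the pattern this PR applies:
# inherit from both a ReFrame test class and EESSI_Mixin, declare the
# attributes the mixin consumes, and let the mixin install the common hooks
# instead of writing them by hand.
import reframe as rfm

from eessi.testsuite.constants import DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE
from eessi.testsuite.eessi_mixin import EESSI_Mixin


@rfm.simple_test
class Example_MixinBased_Test(rfm.RunOnlyRegressionTest, EESSI_Mixin):
    device_type = DEVICE_TYPES[CPU]          # was 'compute_device' before this PR
    compute_unit = COMPUTE_UNIT[NUMA_NODE]   # mixin assigns one task per NUMA node
    bench_name_ci = 'resnet50'               # which benchmark variant is tagged for CI

    descr = 'Placeholder mixin-based test'
    executable = 'python'
    valid_prog_environs = ['default']
    valid_systems = ['*']
    time_limit = '30m'

    def required_mem_per_node(self):
        # the mixin queries each test for its per-node memory requirement,
        # mirroring the method kept in the diff below (presumably in MiB)
        return self.num_tasks_per_node * 1024
```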
76 changes: 20 additions & 56 deletions eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
@@ -2,79 +2,41 @@

import reframe as rfm
import reframe.utility.sanity as sn
# Added only to make the linter happy
from reframe.core.builtins import parameter, variable, run_after, sanity_function, performance_function

from eessi.testsuite import hooks
from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU
from eessi.testsuite.constants import DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU
from eessi.testsuite.eessi_mixin import EESSI_Mixin
from eessi.testsuite.utils import find_modules


class EESSI_PyTorch_torchvision(rfm.RunOnlyRegressionTest):
class EESSI_PyTorch_torchvision(rfm.RunOnlyRegressionTest, EESSI_Mixin):
descr = 'Benchmark that runs a selected torchvision model on synthetic data'

nn_model = parameter(['vgg16', 'resnet50', 'resnet152', 'densenet121', 'mobilenet_v3_large'])
scale = parameter(SCALES.keys())
bench_name_ci = 'resnet50'
parallel_strategy = parameter([None, 'ddp'])
compute_device = variable(str)
device_type = variable(str)
Collaborator commented on `device_type = variable(str)`:

I guess we can remove this, no? We also don't set the compute_unit as a variable. I guess I used to have it to enforce setting it in the child class, but the EESSI_Mixin enforces you to set a device type anyway (I think), so there is no use enforcing it here anymore.

Collaborator (Author) replied:

Good point, removed.

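As the review thread notes, the `device_type = variable(str)` declaration only existed to force child classes to set it, and EESSI_Mixin already provides that enforcement. The mixin's actual mechanism is not shown in this diff; the snippet below is a hypothetical sketch of how a ReFrame mixin can perform such a check (illustrative class and attribute names, not the EESSI_Mixin implementation):

```python
# Hypothetical sketch: fail early when a child class forgets to define a
# required attribute such as device_type. The real EESSI_Mixin may do this
# differently.
from reframe.core.builtins import run_after
from reframe.core.pipeline import RegressionMixin


class RequiredAttrsMixin(RegressionMixin):
    _required_attrs = ('device_type', 'compute_unit')

    @run_after('init')
    def _check_required_attrs(self):
        missing = [a for a in self._required_attrs if not hasattr(self, a)]
        if missing:
            raise ValueError(
                f'{type(self).__name__} must define: {", ".join(missing)}'
            )
```
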
# Both torchvision and PyTorch-bundle modules have everything needed to run this test
module_name = parameter(chain(find_modules('torchvision'), find_modules('PyTorch-bundle')))

descr = 'Benchmark that runs a selected torchvision model on synthetic data'

executable = 'python'

valid_prog_environs = ['default']
valid_systems = ['*']

time_limit = '30m'

def required_mem_per_node(self):
return self.num_tasks_per_node * 1024

@run_after('init')
def prepare_test(self):

# Set nn_model as executable option
self.executable_opts = ['pytorch_synthetic_benchmark.py --model %s' % self.nn_model]
self.bench_name = self.nn_model

# If not a GPU run, disable CUDA
if self.compute_device != DEVICE_TYPES[GPU]:
if self.device_type != DEVICE_TYPES[GPU]:
self.executable_opts += ['--no-cuda']

@run_after('init')
def apply_init_hooks(self):
# Filter on which scales are supported by the partitions defined in the ReFrame configuration
hooks.filter_supported_scales(self)

# Make sure that GPU tests run in partitions that support running on a GPU,
# and that CPU-only tests run in partitions that support running CPU-only.
# Also support setting valid_systems on the cmd line.
hooks.filter_valid_systems_by_device_type(self, required_device_type=self.compute_device)

# Support selecting modules on the cmd line.
hooks.set_modules(self)

# Support selecting scales on the cmd line via tags.
hooks.set_tag_scale(self)

@run_after('init')
def set_tag_ci(self):
if self.nn_model == 'resnet50':
self.tags.add(TAGS['CI'])

@run_after('setup')
def apply_setup_hooks(self):
if self.compute_device == DEVICE_TYPES[GPU]:
hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU])
else:
# Hybrid code, for which launching one task per NUMA_NODE is typically the most efficient
hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])

# This is a hybrid test, binding is important for performance
hooks.set_compact_process_binding(self)

# Set OMP_NUM_THREADS based on the number of cores per task
self.env_vars["OMP_NUM_THREADS"] = self.num_cpus_per_task

@run_after('setup')
def set_ddp_options(self):
# Set environment variables for PyTorch DDP
"Set environment variables for PyTorch DDP"
if self.parallel_strategy == 'ddp':
# Set additional options required by DDP
self.executable_opts += ["--master-port $(python get_free_socket.py)"]
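
The `--master-port $(python get_free_socket.py)` option relies on a small helper script that is not part of this diff; presumably it prints a free TCP port to use for the DDP rendezvous. A hypothetical sketch of how such a helper is typically written (not the actual get_free_socket.py):

```python
# Hypothetical stand-in for get_free_socket.py: bind to port 0 so the OS
# assigns a free ephemeral port, then print it for use as --master-port.
import socket

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind(('', 0))               # port 0 -> kernel picks a free port
    print(s.getsockname()[1])     # emit the chosen port number on stdout
```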
@@ -94,7 +56,7 @@ def filter_invalid_parameter_combinations(self):

@run_after('setup')
def pass_parallel_strategy(self):
# Set parallelization strategy when using more than one process
"Set parallelization strategy when using more than one process"
if self.num_tasks != 1:
self.executable_opts += ['--use-%s' % self.parallel_strategy]

@@ -110,21 +72,23 @@ def total_throughput(self):

@performance_function('img/sec')
def througput_per_CPU(self):
'''Training througput per CPU'''
if self.compute_device == DEVICE_TYPES[CPU]:
'''Training througput per device type'''
if self.device_type == DEVICE_TYPES[CPU]:
return sn.extractsingle(r'Img/sec per CPU:\s+(?P<perf_per_cpu>\S+)', self.stdout, 'perf_per_cpu', float)
else:
return sn.extractsingle(r'Img/sec per GPU:\s+(?P<perf_per_gpu>\S+)', self.stdout, 'perf_per_gpu', float)


@rfm.simple_test
class EESSI_PyTorch_torchvision_CPU(EESSI_PyTorch_torchvision):
compute_device = DEVICE_TYPES[CPU]
device_type = DEVICE_TYPES[CPU]
compute_unit = COMPUTE_UNIT[NUMA_NODE]


@rfm.simple_test
class EESSI_PyTorch_torchvision_GPU(EESSI_PyTorch_torchvision):
compute_device = DEVICE_TYPES[GPU]
device_type = DEVICE_TYPES[GPU]
compute_unit = COMPUTE_UNIT[GPU]
precision = parameter(['default', 'mixed'])

@run_after('init')