From 5b2d914e626831147a5d676ed1fd344a6be8bcc0 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 23 Jul 2024 13:57:04 +0000 Subject: [PATCH 1/2] compiler: Honour multi-par-tile as input --- devito/core/operator.py | 4 +++ devito/passes/clusters/blocking.py | 43 ++++++++++++++++-------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/devito/core/operator.py b/devito/core/operator.py index 390f2469d7..c05d98b60e 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -430,3 +430,7 @@ def __new__(cls, items, default=None, sparse=None, reduce=None): obj.reduce = as_tuple(reduce) return obj + + @property + def is_multi(self): + return len(self) > 1 diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 9ee0882190..a8e12de9b9 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -502,30 +502,33 @@ class BlockSizeGenerator: def __init__(self, par_tile): self.tip = -1 - - # The default par-tile, as an UnboundedMultiTuple. It will be used - # for most cases self.umt = par_tile - # Special case 1: a smaller par-tile to avoid under-utilizing - # computational resources when the iteration spaces are too small - self.umt_small = UnboundedMultiTuple(par_tile.default) - - # Special case 2: par-tiles for iteration spaces that must be fully - # blocked for correctness - if par_tile.sparse: - self.umt_sparse = UnboundTuple(*par_tile.sparse, 1) - elif len(par_tile) == 1: - self.umt_sparse = UnboundTuple(*par_tile[0], 1) + if par_tile.is_multi: + # The user has supplied one specific par-tile per blocked nest + self.umt_small = par_tile + self.umt_sparse = par_tile + self.umt_reduce = par_tile else: - self.umt_sparse = UnboundTuple(*par_tile.default, 1) + # Special case 1: a smaller par-tile to avoid under-utilizing + # computational resources when the iteration spaces are too small + self.umt_small = UnboundedMultiTuple(par_tile.default) + + # Special case 2: par-tiles for iteration spaces that must be fully + # blocked for correctness + if par_tile.sparse: + self.umt_sparse = UnboundTuple(*par_tile.sparse, 1) + elif len(par_tile) == 1: + self.umt_sparse = UnboundTuple(*par_tile[0], 1) + else: + self.umt_sparse = UnboundTuple(*par_tile.default, 1) - if par_tile.reduce: - self.umt_reduce = UnboundTuple(*par_tile.reduce, 1) - elif len(par_tile) == 1: - self.umt_reduce = UnboundTuple(*par_tile[0], 1) - else: - self.umt_reduce = UnboundTuple(*par_tile.default, 1) + if par_tile.reduce: + self.umt_reduce = UnboundTuple(*par_tile.reduce, 1) + elif len(par_tile) == 1: + self.umt_reduce = UnboundTuple(*par_tile[0], 1) + else: + self.umt_reduce = UnboundTuple(*par_tile.default, 1) def next(self, prefix, d, clusters): # If a whole new set of Dimensions, move the tip -- this means `clusters` From 1efe5d4f557077bbfec8385e4ca0ebe3803b6d7c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 24 Jul 2024 08:12:21 +0000 Subject: [PATCH 2/2] mpi: Revamp topology API --- FAQ.md | 45 ++++++++++++++++++++++++++++++++++++++- devito/__init__.py | 8 ++++++- devito/mpi/distributed.py | 31 ++++++++++++++++----------- devito/parameters.py | 1 + devito/types/grid.py | 15 +++++++++++-- tests/test_mpi.py | 10 +++++++++ 6 files changed, 94 insertions(+), 16 deletions(-) diff --git a/FAQ.md b/FAQ.md index ecd011faea..345efb2167 100644 --- a/FAQ.md +++ b/FAQ.md @@ -598,7 +598,50 @@ By default, Devito compiles the generated code using flags that maximize the run ## Can I control the MPI domain decomposition? -Until Devito v3.5 included, domain decomposition occurs along the fastest axis. As of later versions, domain decomposition occurs along the slowest axis, for performance reasons. And yes, it is possible to control the domain decomposition in user code, but this is not neatly documented. Take a look at `class CustomTopology` in [distributed.py](https://github.com/devitocodes/devito/blob/master/devito/mpi/distributed.py) and `test_custom_topology` in [this file](https://github.com/devitocodes/devito/blob/master/tests/test_mpi.py). In essence, `Grid` accepts the optional argument `topology`, which allows the user to pass a custom topology as an n-tuple, where `n` is the number of distributed dimensions. For example, for a two-dimensional grid, the topology `(4, 1)` will decompose the slowest axis into four partitions, one partition per MPI rank, while the fastest axis will be replicated over all MPI ranks. +By default, domain decomposition starts along the slowest axis. This means that for an n-dimensional grid, the outermost dimension is decomposed first. For example, in a three-dimensional grid (x, y, z), the x dimension is split into chunks first, followed by the y dimension, and finally the z dimension. Then the process restarts from the outermost dimension again, ensuring an n-dimensional decomposition, until as many partitions as MPI ranks are created. + +### Customization + +The `Grid` class accepts an optional `topology` argument, allowing users to specify custom domain decompositions as an n-tuple, where `n` is the number of distributed dimensions. For instance, for a two-dimensional grid, the topology `(4, 1)` will decompose the slowest axis into four partitions (one per MPI rank), while the fastest axis will be replicated across all MPI ranks. So, for example, given a `Grid(shape=(16, 16), topology=(4, 1))`, the x dimension would be split into 4 chunks of 4, resulting in partitions of shape `(4, 16)` for each MPI rank. + +### Wildcard-based Decomposition + +Consider a domain with three distributed dimensions: x, y, and z, and an MPI communicator with `N` processes. Here are some examples of specifying a custom `topology`: + +- **With N known (e.g., N=4)**: + - `(1, 1, 4)`: Decomposes the z dimension into 4 chunks. + - `(2, 1, 2)`: Decomposes the x dimension into 2 chunks and the z dimension into 2 chunks. + +- **With N unknown**: + - `(1, '*', 1)`: The wildcard `'*'` indicates that the runtime should decompose the y dimension into N chunks. + - `('*', '*', 1)`: The runtime decomposes both x and y dimensions into factors of N, prioritizing the outermost dimension. + + Assuming the number of ranks `N` cannot be evenly decomposed into the requested stars, decomposition is as even as possible, prioritizing the outermost dimension: + + - **For N=3**: + - `('*', '*', 1)` results in (3, 1, 1). + - `('*', 1, '*')` results in (3, 1, 1). + - `(1, '*', '*')` results in (1, 3, 1). + + - **For N=6**: + - `('*', '*', 1)` results in (3, 2, 1). + - `('*', 1, '*')` results in (3, 1, 2). + - `(1, '*', '*')` results in (1, 3, 2). + + - **For N=8**: + - `('*', '*', '*')` results in (2, 2, 2). + - `('*', '*', 1)` results in (4, 2, 1). + - `('*', 1, '*')` results in (4, 1, 2). + - `(1, '*', '*')` results in (1, 4, 2). + +### The `DEVITO_TOPOLOGY` Environment Variable + +As of Devito v4.8.11, the domain decomposition topology can also be specified globally using the environment variable `DEVITO_TOPOLOGY`. Accepted values are: + +- `x`: Corresponds to the topology `('*', 1, 1)`, decomposing the x dimension. +- `y`: Corresponds to the topology `(1, '*', 1)`, decomposing the y dimension. +- `z`: Corresponds to the topology `(1, 1, '*')`, decomposing the z dimension. +- `xy`: Corresponds to the topology `('*', '*', 1)`, decomposing both x and y dimensions. [top](#Frequently-Asked-Questions) diff --git a/devito/__init__.py b/devito/__init__.py index 143870319a..ef08d206f2 100644 --- a/devito/__init__.py +++ b/devito/__init__.py @@ -71,13 +71,19 @@ def reinit_compiler(val): # Setup language for shared-memory parallelism preprocessor = lambda i: {0: 'C', 1: 'openmp'}.get(i, i) # Handles DEVITO_OPENMP deprec configuration.add('language', 'C', [0, 1] + list(operator_registry._languages), - preprocessor=preprocessor, callback=reinit_compiler, deprecate='openmp') + preprocessor=preprocessor, callback=reinit_compiler, + deprecate='openmp') # MPI mode (0 => disabled, 1 == basic) preprocessor = lambda i: bool(i) if isinstance(i, int) else i configuration.add('mpi', 0, [0, 1] + list(mpi_registry), preprocessor=preprocessor, callback=reinit_compiler) +# Domain decomposition topology. Only relevant with MPI +preprocessor = lambda i: CustomTopology._shortcuts.get(i) +configuration.add('topology', None, [None] + list(CustomTopology._shortcuts), + preprocessor=preprocessor) + # Should Devito run a first-touch Operator upon data allocation? configuration.add('first-touch', 0, [0, 1], preprocessor=bool, impacts_jit=False) diff --git a/devito/mpi/distributed.py b/devito/mpi/distributed.py index 090e3ad243..a0479b3032 100644 --- a/devito/mpi/distributed.py +++ b/devito/mpi/distributed.py @@ -242,9 +242,7 @@ def __init__(self, shape, dimensions, input_comm=None, topology=None): self._topology = compute_dims(self._input_comm.size, len(shape)) else: # A custom topology may contain integers or the wildcard '*' - topology = CustomTopology(topology, self._input_comm) - - self._topology = topology + self._topology = CustomTopology(topology, self._input_comm) if self._input_comm is not input_comm: # By default, Devito arranges processes into a cartesian topology. @@ -634,8 +632,9 @@ class CustomTopology(tuple): Examples -------- - For example, let's consider a domain with three distributed dimensions: x, y, and z, - and an MPI communicator with N processes. Here are a few examples of CustomTopology: + For example, let's consider a domain with three distributed dimensions: x, + y, and z, and an MPI communicator with N processes. Here are a few examples + of CustomTopology: With N known, say N=4: * `(1, 1, 4)`: the z Dimension is decomposed into 4 chunks @@ -643,14 +642,15 @@ class CustomTopology(tuple): is decomposed into 2 chunks With N unknown: - * `(1, '*', 1)`: the wildcard `'*'` indicates that the runtime should decompose the y - Dimension into N chunks - * `('*', '*', 1)`: the wildcard `'*'` indicates that the runtime should decompose both - the x and y Dimensions in `nstars` factors of N, prioritizing - the outermost dimension + * `(1, '*', 1)`: the wildcard `'*'` indicates that the runtime should + decompose the y Dimension into N chunks + * `('*', '*', 1)`: the wildcard `'*'` indicates that the runtime should + decompose both the x and y Dimensions in `nstars` factors + of N, prioritizing the outermost dimension - Assuming that the number of ranks `N` cannot evenly be decomposed to the requested - stars=6 we decompose as evenly as possible by prioritising the outermost dimension + Assuming that the number of ranks `N` cannot evenly be decomposed to the + requested stars=6 we decompose as evenly as possible by prioritising the + outermost dimension For N=3 * `('*', '*', 1)` gives: (3, 1, 1) @@ -674,6 +674,13 @@ class CustomTopology(tuple): by the Devito runtime based on user input. """ + _shortcuts = { + 'x': ('*', 1, 1), + 'y': (1, '*', 1), + 'z': (1, 1, '*'), + 'xy': ('*', '*', 1), + } + def __new__(cls, items, input_comm): # Keep track of nstars and already defined decompositions nstars = items.count('*') diff --git a/devito/parameters.py b/devito/parameters.py index 142ab8d08e..d9ca3ee227 100644 --- a/devito/parameters.py +++ b/devito/parameters.py @@ -146,6 +146,7 @@ def _signature_items(self): 'DEVITO_DEVELOP': 'develop-mode', 'DEVITO_OPT': 'opt', 'DEVITO_MPI': 'mpi', + 'DEVITO_TOPOLOGY': 'topology', 'DEVITO_LANGUAGE': 'language', 'DEVITO_AUTOTUNING': 'autotuning', 'DEVITO_LOGGING': 'log-level', diff --git a/devito/types/grid.py b/devito/types/grid.py index 041f4156b1..fc44f34b0d 100644 --- a/devito/types/grid.py +++ b/devito/types/grid.py @@ -5,6 +5,7 @@ import numpy as np from sympy import prod +from devito import configuration from devito.data import LEFT, RIGHT from devito.logger import warning from devito.mpi import Distributor, MPI @@ -163,8 +164,18 @@ def __init__(self, shape, extent=None, origin=None, dimensions=None, # Create a Distributor, used internally to implement domain decomposition # by all Functions defined on this Grid - self._topology = topology - self._distributor = Distributor(shape, dimensions, comm, topology) + topology = topology or configuration['topology'] + if topology: + if len(topology) == len(self.shape): + self._topology = topology + else: + warning("Ignoring the provided topology `%s` as it " + "is incompatible with the grid shape `%s`" % + (topology, self.shape)) + self._topology = None + else: + self._topology = None + self._distributor = Distributor(shape, dimensions, comm, self._topology) # The physical extent self._extent = as_tuple(extent or tuple(1. for _ in self.shape)) diff --git a/tests/test_mpi.py b/tests/test_mpi.py index c0836fb0c2..e7dfd5d851 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -228,6 +228,16 @@ def test_custom_topology_v2(self, comm_size, topology, dist_topology, mode): custom_topology = CustomTopology(topology, dummy_comm) assert custom_topology == dist_topology + @pytest.mark.parallel(mode=[(4, 'diag2')]) + @switchconfig(topology='y') + def test_custom_topology_fallback(self, mode): + grid = Grid(shape=(16,)) + f = Function(name='f', grid=grid) + + # The input topology was `y` but Grid only has one axis, so we decompose + # along that instead + assert f.shape == (4,) + class TestFunction: