From 101f04172e99b7b6bae4dc62a2f112dd0a6d4108 Mon Sep 17 00:00:00 2001 From: FabioLuporini Date: Fri, 24 Jun 2022 15:32:52 +0200 Subject: [PATCH 1/4] compiler: Simplify cluster.is_dense --- devito/ir/clusters/cluster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 1759447da5..60a337a8f6 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -204,7 +204,6 @@ def is_dense(self): # Fallback to legacy is_dense checks return (not any(e.conditionals for e in self.exprs) and not any(f.is_SparseFunction for f in self.functions) and - not self.is_scalar and all(a.is_regular for a in self.scope.accesses)) @cached_property From c18a698f465bbf1b2f77a942ce9a416929cedf06 Mon Sep 17 00:00:00 2001 From: FabioLuporini Date: Fri, 24 Jun 2022 17:52:34 +0200 Subject: [PATCH 2/4] compiler: Generalize normalize_reductions --- devito/core/cpu.py | 7 +++++ devito/core/gpu.py | 7 +++++ devito/ir/clusters/algorithms.py | 44 ++++++++++++++++++++++++++------ devito/ir/clusters/analysis.py | 2 +- devito/ir/clusters/cluster.py | 4 +++ 5 files changed, 55 insertions(+), 9 deletions(-) diff --git a/devito/core/cpu.py b/devito/core/cpu.py index 29e0f50864..f7772551f9 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -76,6 +76,12 @@ class Cpu64OperatorMixin(object): than this threshold. """ + MAPIFY_REDUCE = False + """ + Vector-expand all scalar reductions to turn them into explicit map-reductions, + which may be easier to parallelize for certain backends. + """ + @classmethod def _normalize_kwargs(cls, **kwargs): o = {} @@ -119,6 +125,7 @@ def _normalize_kwargs(cls, **kwargs): # Misc o['optcomms'] = oo.pop('optcomms', True) o['linearize'] = oo.pop('linearize', False) + o['mapify-reduce'] = oo.pop('mapify-reduce', cls.MAPIFY_REDUCE) # Recognised but unused by the CPU backend oo.pop('par-disabled', None) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 517ba94718..cf661758f4 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -60,6 +60,12 @@ class DeviceOperatorMixin(object): Assuming all functions fit into the gpu memory. """ + MAPIFY_REDUCE = False + """ + Vector-expand all scalar reductions to turn them into explicit map-reductions, + which may be easier to parallelize for certain backends. + """ + @classmethod def _normalize_kwargs(cls, **kwargs): o = {} @@ -104,6 +110,7 @@ def _normalize_kwargs(cls, **kwargs): # Misc o['optcomms'] = oo.pop('optcomms', True) o['linearize'] = oo.pop('linearize', False) + o['mapify-reduce'] = oo.pop('mapify-reduce', cls.MAPIFY_REDUCE) if oo: raise InvalidOperator("Unsupported optimization options: [%s]" diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 035014014e..e68c8ad1ce 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -13,13 +13,13 @@ from devito.symbolics import retrieve_indexed, uxreplace, xreplace_indices from devito.tools import (DefaultOrderedDict, Stamp, as_mapper, flatten, is_integer, timed_pass) -from devito.types import Eq, Symbol +from devito.types import Array, Eq, Inc, Symbol from devito.types.dimension import BOTTOM, ModuloDimension __all__ = ['clusterize'] -def clusterize(exprs, options=None, **kwargs): +def clusterize(exprs, **kwargs): """ Turn a sequence of LoweredEqs into a sequence of Clusters. """ @@ -36,7 +36,7 @@ def clusterize(exprs, options=None, **kwargs): clusters = guard(clusters) # Determine relevant computational properties (e.g., parallelism) - clusters = analyze(clusters, options) + clusters = analyze(clusters) # Input normalization (e.g., SSA) clusters = normalize(clusters, **kwargs) @@ -322,10 +322,11 @@ def rule(size, e): def normalize(clusters, **kwargs): + options = kwargs['options'] sregistry = kwargs['sregistry'] clusters = normalize_nested_indexeds(clusters, sregistry) - clusters = normalize_reductions(clusters, sregistry) + clusters = normalize_reductions(clusters, sregistry, options) return clusters @@ -368,19 +369,46 @@ def pull_indexeds(expr, subs, mapper, parent=None): @cluster_pass(mode='all') -def normalize_reductions(cluster, sregistry): +def normalize_reductions(cluster, sregistry, options): """ Extract the right-hand sides of reduction Eq's in to temporaries. """ - if not any(PARALLEL_IF_ATOMIC in v for v in cluster.properties.values()): + opt_mapify_reduce = options['mapify-reduce'] + + dims = [d for d, v in cluster.properties.items() if PARALLEL_IF_ATOMIC in v] + + if not dims: return cluster processed = [] for e in cluster.exprs: - if e.is_Increment and e.lhs.function.is_AbstractFunction: - v = Symbol(name=sregistry.make_name(), dtype=e.dtype) + if e.is_Reduction and e.lhs.is_Indexed and cluster.is_sparse: + # Transform `e` such that we reduce into a scalar (ultimately via + # atomic ops, though this part is carried out by a much later pass) + # For example, given `i = m[p_src]` (i.e., indirection array), turn: + # `u[t, i] += f(u[t, i], src, ...)` + # into + # `s = f(u[t, i], src, ...)` + # `u[t, i] += s` + name = sregistry.make_name() + v = Symbol(name=name, dtype=e.dtype) processed.extend([e.func(v, e.rhs, operation=None), e.func(e.lhs, v)]) + + elif e.is_Reduction and e.lhs.is_Symbol and opt_mapify_reduce: + # Transform `e` into what is in essence an explicit map-reduce + # For example, turn: + # `s += f(u[x], v[x], ...)` + # into + # `r[x] = f(u[x], v[x], ...)` + # `s += r[x]` + # This makes it much easier to parallelize the map part regardless + # of the target backend + name = sregistry.make_name() + a = Array(name=name, dtype=e.dtype, dimensions=dims) + processed.extend([Eq(a.indexify(), e.rhs), + Inc(e.lhs, a.indexify())]) + else: processed.append(e) diff --git a/devito/ir/clusters/analysis.py b/devito/ir/clusters/analysis.py index f50b1db8a7..4778f2b2b9 100644 --- a/devito/ir/clusters/analysis.py +++ b/devito/ir/clusters/analysis.py @@ -7,7 +7,7 @@ @timed_pass() -def analyze(clusters, options): +def analyze(clusters): state = QueueStateful.State() # Collect properties diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 60a337a8f6..3299e4e755 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -206,6 +206,10 @@ def is_dense(self): not any(f.is_SparseFunction for f in self.functions) and all(a.is_regular for a in self.scope.accesses)) + @property + def is_sparse(self): + return not self.is_dense + @cached_property def dtype(self): """ From f8380b69762caaa775b514b5baa60681621d43a0 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 27 Jul 2022 13:30:45 +0100 Subject: [PATCH 3/4] compiler: Bypass Wildcards in SSA lowering --- devito/passes/clusters/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/devito/passes/clusters/utils.py b/devito/passes/clusters/utils.py index 181419abbd..853dba808a 100644 --- a/devito/passes/clusters/utils.py +++ b/devito/passes/clusters/utils.py @@ -1,5 +1,5 @@ from devito.symbolics import uxreplace -from devito.types import Symbol +from devito.types import Symbol, Wildcard __all__ = ['makeit_ssa'] @@ -11,7 +11,8 @@ def makeit_ssa(exprs): # Identify recurring LHSs seen = {} for i, e in enumerate(exprs): - seen.setdefault(e.lhs, []).append(i) + if not isinstance(e.lhs, Wildcard): + seen.setdefault(e.lhs, []).append(i) # Optimization: don't waste time reconstructing stuff if already in SSA form if all(len(i) == 1 for i in seen.values()): return exprs From 693485f5ec3343bb6321dcaedee02012adafc31a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 27 Jul 2022 13:54:23 +0100 Subject: [PATCH 4/4] compiler: Patch factorizer pass after relaxing is_dense --- .github/workflows/docker-devito.yml | 4 ++-- devito/passes/clusters/factorization.py | 4 ++-- devito/symbolics/extended_sympy.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml index 9ced958a7a..7de99381c9 100644 --- a/.github/workflows/docker-devito.yml +++ b/.github/workflows/docker-devito.yml @@ -22,13 +22,13 @@ jobs: tag: 'nvidia-nvc' flag: '--gpus all' test: 'tests/test_gpu_openacc.py tests/test_gpu_common.py' - runner: ["self-hosted", "gpu", "docker"] + runner: ["self-hosted", "gpu", "docker", "v1004"] - base: 'bases:nvidia-clang' tag: 'nvidia-clang' flag: '--gpus all' test: 'tests/test_gpu_openmp.py tests/test_gpu_common.py' - runner: ["self-hosted", "gpu", "docker"] + runner: ["self-hosted", "gpu", "docker", "kimogila"] # Runtime gpu flags from https://hub.docker.com/r/rocm/tensorflow/ - base: 'bases:amd' diff --git a/devito/passes/clusters/factorization.py b/devito/passes/clusters/factorization.py index 7b3cf2872d..f023762402 100644 --- a/devito/passes/clusters/factorization.py +++ b/devito/passes/clusters/factorization.py @@ -3,7 +3,7 @@ from sympy import Add, Mul, S, collect from devito.ir import cluster_pass -from devito.symbolics import estimate_cost, retrieve_symbols +from devito.symbolics import BasicWrapperMixin, estimate_cost, retrieve_symbols from devito.tools import ReducerMap __all__ = ['factorize'] @@ -115,7 +115,7 @@ def run(expr): return expr, {'funcs': expr} elif expr.is_Pow: return expr, {'pows': expr} - elif expr.is_Symbol or expr.is_Indexed or expr.is_Atom: + elif expr.is_Symbol or expr.is_Indexed or isinstance(expr, BasicWrapperMixin): return expr, {} elif expr.is_Add: args, candidates = zip(*[run(arg) for arg in expr.args]) diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index 294a3e3514..1e58b90937 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -15,7 +15,8 @@ 'FieldFromComposite', 'ListInitializer', 'Byref', 'IndexedPointer', 'Cast', 'DefFunction', 'InlineIf', 'Keyword', 'String', 'Macro', 'MacroArgument', 'CustomType', 'Deref', 'INT', 'FLOAT', 'DOUBLE', 'VOID', 'CEIL', - 'FLOOR', 'MAX', 'MIN', 'Null', 'SizeOf', 'rfunc', 'cast_mapper'] + 'FLOOR', 'MAX', 'MIN', 'Null', 'SizeOf', 'rfunc', 'cast_mapper', + 'BasicWrapperMixin'] class CondEq(sympy.Eq):