From 101f04172e99b7b6bae4dc62a2f112dd0a6d4108 Mon Sep 17 00:00:00 2001
From: FabioLuporini <fabilupo@gmail.com>
Date: Fri, 24 Jun 2022 15:32:52 +0200
Subject: [PATCH 1/4] compiler: Simplify cluster.is_dense

---
 devito/ir/clusters/cluster.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py
index 1759447da5..60a337a8f6 100644
--- a/devito/ir/clusters/cluster.py
+++ b/devito/ir/clusters/cluster.py
@@ -204,7 +204,6 @@ def is_dense(self):
         # Fallback to legacy is_dense checks
         return (not any(e.conditionals for e in self.exprs) and
                 not any(f.is_SparseFunction for f in self.functions) and
-                not self.is_scalar and
                 all(a.is_regular for a in self.scope.accesses))
 
     @cached_property

From c18a698f465bbf1b2f77a942ce9a416929cedf06 Mon Sep 17 00:00:00 2001
From: FabioLuporini <fabilupo@gmail.com>
Date: Fri, 24 Jun 2022 17:52:34 +0200
Subject: [PATCH 2/4] compiler: Generalize normalize_reductions

---
 devito/core/cpu.py               |  7 +++++
 devito/core/gpu.py               |  7 +++++
 devito/ir/clusters/algorithms.py | 44 ++++++++++++++++++++++++++------
 devito/ir/clusters/analysis.py   |  2 +-
 devito/ir/clusters/cluster.py    |  4 +++
 5 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
index 29e0f50864..f7772551f9 100644
--- a/devito/core/cpu.py
+++ b/devito/core/cpu.py
@@ -76,6 +76,12 @@ class Cpu64OperatorMixin(object):
     than this threshold.
     """
 
+    MAPIFY_REDUCE = False
+    """
+    Vector-expand all scalar reductions to turn them into explicit map-reductions,
+    which may be easier to parallelize for certain backends.
+    """
+
     @classmethod
     def _normalize_kwargs(cls, **kwargs):
         o = {}
@@ -119,6 +125,7 @@ def _normalize_kwargs(cls, **kwargs):
         # Misc
         o['optcomms'] = oo.pop('optcomms', True)
         o['linearize'] = oo.pop('linearize', False)
+        o['mapify-reduce'] = oo.pop('mapify-reduce', cls.MAPIFY_REDUCE)
 
         # Recognised but unused by the CPU backend
         oo.pop('par-disabled', None)
diff --git a/devito/core/gpu.py b/devito/core/gpu.py
index 517ba94718..cf661758f4 100644
--- a/devito/core/gpu.py
+++ b/devito/core/gpu.py
@@ -60,6 +60,12 @@ class DeviceOperatorMixin(object):
     Assuming all functions fit into the gpu memory.
     """
 
+    MAPIFY_REDUCE = False
+    """
+    Vector-expand all scalar reductions to turn them into explicit map-reductions,
+    which may be easier to parallelize for certain backends.
+    """
+
     @classmethod
     def _normalize_kwargs(cls, **kwargs):
         o = {}
@@ -104,6 +110,7 @@ def _normalize_kwargs(cls, **kwargs):
         # Misc
         o['optcomms'] = oo.pop('optcomms', True)
         o['linearize'] = oo.pop('linearize', False)
+        o['mapify-reduce'] = oo.pop('mapify-reduce', cls.MAPIFY_REDUCE)
 
         if oo:
             raise InvalidOperator("Unsupported optimization options: [%s]"
diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py
index 035014014e..e68c8ad1ce 100644
--- a/devito/ir/clusters/algorithms.py
+++ b/devito/ir/clusters/algorithms.py
@@ -13,13 +13,13 @@
 from devito.symbolics import retrieve_indexed, uxreplace, xreplace_indices
 from devito.tools import (DefaultOrderedDict, Stamp, as_mapper, flatten,
                           is_integer, timed_pass)
-from devito.types import Eq, Symbol
+from devito.types import Array, Eq, Inc, Symbol
 from devito.types.dimension import BOTTOM, ModuloDimension
 
 __all__ = ['clusterize']
 
 
-def clusterize(exprs, options=None, **kwargs):
+def clusterize(exprs, **kwargs):
     """
     Turn a sequence of LoweredEqs into a sequence of Clusters.
     """
@@ -36,7 +36,7 @@ def clusterize(exprs, options=None, **kwargs):
     clusters = guard(clusters)
 
     # Determine relevant computational properties (e.g., parallelism)
-    clusters = analyze(clusters, options)
+    clusters = analyze(clusters)
 
     # Input normalization (e.g., SSA)
     clusters = normalize(clusters, **kwargs)
@@ -322,10 +322,11 @@ def rule(size, e):
 
 
 def normalize(clusters, **kwargs):
+    options = kwargs['options']
     sregistry = kwargs['sregistry']
 
     clusters = normalize_nested_indexeds(clusters, sregistry)
-    clusters = normalize_reductions(clusters, sregistry)
+    clusters = normalize_reductions(clusters, sregistry, options)
 
     return clusters
 
@@ -368,19 +369,46 @@ def pull_indexeds(expr, subs, mapper, parent=None):
 
 
 @cluster_pass(mode='all')
-def normalize_reductions(cluster, sregistry):
+def normalize_reductions(cluster, sregistry, options):
     """
     Extract the right-hand sides of reduction Eq's in to temporaries.
     """
-    if not any(PARALLEL_IF_ATOMIC in v for v in cluster.properties.values()):
+    opt_mapify_reduce = options['mapify-reduce']
+
+    dims = [d for d, v in cluster.properties.items() if PARALLEL_IF_ATOMIC in v]
+
+    if not dims:
         return cluster
 
     processed = []
     for e in cluster.exprs:
-        if e.is_Increment and e.lhs.function.is_AbstractFunction:
-            v = Symbol(name=sregistry.make_name(), dtype=e.dtype)
+        if e.is_Reduction and e.lhs.is_Indexed and cluster.is_sparse:
+            # Transform `e` such that we reduce into a scalar (ultimately via
+            # atomic ops, though this part is carried out by a much later pass).
+            # For example, given `i = m[p_src]` (i.e., an indirection array), turn:
+            # `u[t, i] += f(u[t, i], src, ...)`
+            # into
+            # `s = f(u[t, i], src, ...)`
+            # `u[t, i] += s`
+            name = sregistry.make_name()
+            v = Symbol(name=name, dtype=e.dtype)
             processed.extend([e.func(v, e.rhs, operation=None),
                               e.func(e.lhs, v)])
+
+        elif e.is_Reduction and e.lhs.is_Symbol and opt_mapify_reduce:
+            # Transform `e` into what is in essence an explicit map-reduce.
+            # For example, turn:
+            # `s += f(u[x], v[x], ...)`
+            # into
+            # `r[x] = f(u[x], v[x], ...)`
+            # `s += r[x]`
+            # This makes it much easier to parallelize the map part, regardless
+            # of the target backend.
+            name = sregistry.make_name()
+            a = Array(name=name, dtype=e.dtype, dimensions=dims)
+            processed.extend([Eq(a.indexify(), e.rhs),
+                              Inc(e.lhs, a.indexify())])
+
         else:
             processed.append(e)
 
diff --git a/devito/ir/clusters/analysis.py b/devito/ir/clusters/analysis.py
index f50b1db8a7..4778f2b2b9 100644
--- a/devito/ir/clusters/analysis.py
+++ b/devito/ir/clusters/analysis.py
@@ -7,7 +7,7 @@
 
 
 @timed_pass()
-def analyze(clusters, options):
+def analyze(clusters):
     state = QueueStateful.State()
 
     # Collect properties
diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py
index 60a337a8f6..3299e4e755 100644
--- a/devito/ir/clusters/cluster.py
+++ b/devito/ir/clusters/cluster.py
@@ -206,6 +206,10 @@ def is_dense(self):
                 not any(f.is_SparseFunction for f in self.functions) and
                 all(a.is_regular for a in self.scope.accesses))
 
+    @property
+    def is_sparse(self):
+        return not self.is_dense
+
     @cached_property
     def dtype(self):
         """

From f8380b69762caaa775b514b5baa60681621d43a0 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabilupo@gmail.com>
Date: Wed, 27 Jul 2022 13:30:45 +0100
Subject: [PATCH 3/4] compiler: Bypass Wildcards in SSA lowering

---
 devito/passes/clusters/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/devito/passes/clusters/utils.py b/devito/passes/clusters/utils.py
index 181419abbd..853dba808a 100644
--- a/devito/passes/clusters/utils.py
+++ b/devito/passes/clusters/utils.py
@@ -1,5 +1,5 @@
 from devito.symbolics import uxreplace
-from devito.types import Symbol
+from devito.types import Symbol, Wildcard
 
 __all__ = ['makeit_ssa']
 
@@ -11,7 +11,8 @@ def makeit_ssa(exprs):
     # Identify recurring LHSs
     seen = {}
     for i, e in enumerate(exprs):
-        seen.setdefault(e.lhs, []).append(i)
+        if not isinstance(e.lhs, Wildcard):
+            seen.setdefault(e.lhs, []).append(i)
     # Optimization: don't waste time reconstructing stuff if already in SSA form
     if all(len(i) == 1 for i in seen.values()):
         return exprs

From 693485f5ec3343bb6321dcaedee02012adafc31a Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabilupo@gmail.com>
Date: Wed, 27 Jul 2022 13:54:23 +0100
Subject: [PATCH 4/4] compiler: Patch factorizer pass after relaxing is_dense

---
 .github/workflows/docker-devito.yml     | 4 ++--
 devito/passes/clusters/factorization.py | 4 ++--
 devito/symbolics/extended_sympy.py      | 3 ++-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/docker-devito.yml b/.github/workflows/docker-devito.yml
index 9ced958a7a..7de99381c9 100644
--- a/.github/workflows/docker-devito.yml
+++ b/.github/workflows/docker-devito.yml
@@ -22,13 +22,13 @@ jobs:
             tag: 'nvidia-nvc'
             flag: '--gpus all'
             test: 'tests/test_gpu_openacc.py tests/test_gpu_common.py'
-            runner: ["self-hosted", "gpu", "docker"]
+            runner: ["self-hosted", "gpu", "docker", "v1004"]
 
           - base: 'bases:nvidia-clang'
             tag: 'nvidia-clang'
             flag: '--gpus all'
             test: 'tests/test_gpu_openmp.py tests/test_gpu_common.py'
-            runner: ["self-hosted", "gpu", "docker"]
+            runner: ["self-hosted", "gpu", "docker", "kimogila"]
 
           # Runtime gpu flags from https://hub.docker.com/r/rocm/tensorflow/
           - base: 'bases:amd'
diff --git a/devito/passes/clusters/factorization.py b/devito/passes/clusters/factorization.py
index 7b3cf2872d..f023762402 100644
--- a/devito/passes/clusters/factorization.py
+++ b/devito/passes/clusters/factorization.py
@@ -3,7 +3,7 @@
 from sympy import Add, Mul, S, collect
 
 from devito.ir import cluster_pass
-from devito.symbolics import estimate_cost, retrieve_symbols
+from devito.symbolics import BasicWrapperMixin, estimate_cost, retrieve_symbols
 from devito.tools import ReducerMap
 
 __all__ = ['factorize']
@@ -115,7 +115,7 @@ def run(expr):
             return expr, {'funcs': expr}
         elif expr.is_Pow:
             return expr, {'pows': expr}
-        elif expr.is_Symbol or expr.is_Indexed or expr.is_Atom:
+        elif expr.is_Symbol or expr.is_Indexed or isinstance(expr, BasicWrapperMixin):
             return expr, {}
         elif expr.is_Add:
             args, candidates = zip(*[run(arg) for arg in expr.args])
diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py
index 294a3e3514..1e58b90937 100644
--- a/devito/symbolics/extended_sympy.py
+++ b/devito/symbolics/extended_sympy.py
@@ -15,7 +15,8 @@
            'FieldFromComposite', 'ListInitializer', 'Byref', 'IndexedPointer', 'Cast',
            'DefFunction', 'InlineIf', 'Keyword', 'String', 'Macro', 'MacroArgument',
            'CustomType', 'Deref', 'INT', 'FLOAT', 'DOUBLE', 'VOID', 'CEIL',
-           'FLOOR', 'MAX', 'MIN', 'Null', 'SizeOf', 'rfunc', 'cast_mapper']
+           'FLOOR', 'MAX', 'MIN', 'Null', 'SizeOf', 'rfunc', 'cast_mapper',
+           'BasicWrapperMixin']
 
 
 class CondEq(sympy.Eq):