compiler: Misc improvements to code generation #2516

Open
wants to merge 15 commits into base: master
58 changes: 58 additions & 0 deletions devito/arch/archinfo.py
@@ -31,6 +31,8 @@
'POWER8', 'POWER9',
# Generic GPUs
'AMDGPUX', 'NVIDIAX', 'INTELGPUX',
# Nvidia GPUs
'AMPERE',
# Intel GPUs
'PVC', 'INTELGPUMAX', 'MAX1100', 'MAX1550']

@@ -867,6 +869,12 @@ def limits(self, compiler=None, language=None):
'max-block-dims': 3,
}

def supports(self, query, language=None):
"""
Check if the device supports a given feature.
"""
return False


class IntelDevice(Device):

@@ -895,6 +903,52 @@ def march(self):
return 'tesla'
return None

def supports(self, query, language=None):
if language != 'cuda':
return False

cc = get_nvidia_cc()
if query == 'async-loads' and cc >= 80:
# Asynchronous pipeline loads -- introduced in Ampere
return True
elif query == 'tma' and cc >= 90:
# Tensor Memory Accelerator -- introduced in Hopper
return True
else:
return False


class Volta(NvidiaDevice):
pass


class Ampere(Volta):

def supports(self, query, language=None):
if language != 'cuda':
return False

if query == 'async-loads':
return True

return super().supports(query, language)


class Hopper(Ampere):

def supports(self, query, language=None):
if language != 'cuda':
return False

if query == 'tma':
return True

return super().supports(query, language)


class Blackwell(Hopper):
pass


class AmdDevice(Device):

@@ -963,6 +1017,10 @@ def march(cls):
ANYGPU = Cpu64('gpu')

NVIDIAX = NvidiaDevice('nvidiaX')
VOLTA = Volta('volta')
AMPERE = Ampere('ampere')
HOPPER = Hopper('hopper')
BLACKWELL = Blackwell('blackwell')

AMDGPUX = AmdDevice('amdgpuX')

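The new `supports()` hook gives compilation passes a single capability query instead of scattered compute-capability checks. A rough usage sketch (not part of the diff), assuming the objects defined in `devito/arch/archinfo.py` above; the last query falls through to the generic `NvidiaDevice` path, so its answer depends on what `get_nvidia_cc()` reports at runtime:

```python
# Hedged sketch: capability queries against the platform objects added above.
from devito.arch.archinfo import AMPERE, HOPPER

HOPPER.supports('tma', language='cuda')          # True: TMA arrives with Hopper
HOPPER.supports('async-loads', language='cuda')  # True: inherited from Ampere
AMPERE.supports('async-loads', language='cuda')  # True: async pipeline loads since Ampere
HOPPER.supports('tma', language='openacc')       # False: the queries are CUDA-only
AMPERE.supports('tma', language='cuda')          # defers to NvidiaDevice.supports(), i.e.
                                                 # the runtime get_nvidia_cc() check
```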
16 changes: 8 additions & 8 deletions devito/arch/compiler.py
@@ -13,8 +13,8 @@
from codepy.toolchain import (GCCToolchain,
call_capture_output as _call_capture_output)

from devito.arch import (AMDGPUX, Cpu64, AppleArm, NVIDIAX, POWER8, POWER9, Graviton,
IntelDevice, get_nvidia_cc, check_cuda_runtime,
from devito.arch import (AMDGPUX, Cpu64, AppleArm, NvidiaDevice, POWER8, POWER9,
Graviton, IntelDevice, get_nvidia_cc, check_cuda_runtime,
get_m1_llvm_path)
from devito.exceptions import CompilationError
from devito.logger import debug, warning
@@ -487,7 +487,7 @@ def __init_finalize__(self, **kwargs):
language = kwargs.pop('language', configuration['language'])
platform = kwargs.pop('platform', configuration['platform'])

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.remove('-std=c99')
# Add flags for OpenMP offloading
if language in ['C', 'openmp']:
@@ -555,7 +555,7 @@ def __init_finalize__(self, **kwargs):
if not configuration['safe-math']:
self.cflags.append('-ffast-math')

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.remove('-std=c99')
elif platform is AMDGPUX:
self.cflags.remove('-std=c99')
@@ -607,7 +607,7 @@ def __init_finalize__(self, **kwargs):
language = kwargs.pop('language', configuration['language'])
platform = kwargs.pop('platform', configuration['platform'])

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
if self.version >= Version("24.9"):
self.cflags.append('-gpu=mem:separate:pinnedalloc')
else:
@@ -843,7 +843,7 @@ def __init_finalize__(self, **kwargs):
self.ldflags.remove('-qopenmp')
self.ldflags.append('-fopenmp')

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.append('-fopenmp-targets=nvptx64-cuda')
elif isinstance(platform, IntelDevice):
self.cflags.append('-fiopenmp')
@@ -900,7 +900,7 @@ def __init_finalize__(self, **kwargs):

if isinstance(platform, Cpu64):
pass
elif platform is NVIDIAX:
elif isinstance(platform, NvidiaDevice):
self.cflags.append('-fsycl-targets=nvptx64-cuda')
elif isinstance(platform, IntelDevice):
self.cflags.append('-fsycl-targets=spir64')
@@ -931,7 +931,7 @@ def __new__(cls, *args, **kwargs):
_base = ClangCompiler
elif isinstance(platform, IntelDevice):
_base = OneapiCompiler
elif platform is NVIDIAX:
elif isinstance(platform, NvidiaDevice):
if language == 'cuda':
_base = CudaCompiler
else:
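The `platform is NVIDIAX` identity checks are replaced with `isinstance(platform, NvidiaDevice)` because the newly introduced platforms (`VOLTA`, `AMPERE`, `HOPPER`, `BLACKWELL`) are distinct instances that would never compare identical to the generic `NVIDIAX` object. A minimal sketch of the distinction, reusing the objects from the archinfo diff:

```python
# Hedged sketch: why a subclass check is needed once concrete Nvidia platforms exist.
from devito.arch.archinfo import NVIDIAX, AMPERE, NvidiaDevice

assert AMPERE is not NVIDIAX              # the identity check would miss the new platforms
assert isinstance(AMPERE, NvidiaDevice)   # the isinstance check catches them
assert isinstance(NVIDIAX, NvidiaDevice)  # ...while still matching the generic device
```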
9 changes: 8 additions & 1 deletion devito/finite_differences/differentiable.py
@@ -730,6 +730,12 @@ def __init_finalize__(self, *args, **kwargs):

super().__init_finalize__(*args, **kwargs)

@classmethod
def class_key(cls):
# Ensure Weights appear before any other AbstractFunction
p, v, _ = Array.class_key()
return p, v - 1, cls.__name__

def __eq__(self, other):
return (isinstance(other, Weights) and
self.name == other.name and
@@ -819,7 +825,8 @@ def compare(self, other):
n1 = self.__class__
n2 = other.__class__
if n1.__name__ == n2.__name__:
return self.base.compare(other.base)
return (self.weights.compare(other.weights) or
self.base.compare(other.base))
else:
return super().compare(other)

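`class_key()` is the leading component of SymPy's `sort_key()`, which compares tuples lexicographically, so returning `(p, v - 1, cls.__name__)` makes `Weights` sort ahead of the other `Array`-derived functions in canonically ordered expressions. A small illustration of the underlying mechanism in plain SymPy (no Devito objects; the printed key values vary across SymPy versions):

```python
# Hedged sketch of the ordering mechanism the Weights.class_key override relies on.
from sympy import Symbol, Integer, default_sort_key

x, two = Symbol('x'), Integer(2)
print(Integer.class_key(), Symbol.class_key())  # per-class tuples, compared lexicographically
print(sorted([x, two], key=default_sort_key))   # numbers sort ahead of symbols: [2, x]
```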
42 changes: 15 additions & 27 deletions devito/ir/clusters/cluster.py
@@ -4,16 +4,17 @@
import numpy as np

from devito.ir.equations import ClusterizedEq
from devito.ir.support import (PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext,
Forward, Interval, IntervalGroup, IterationSpace,
DataSpace, Guards, Properties, Scope, WaitLock,
WithLock, PrefetchUpdate, detect_accesses, detect_io,
normalize_properties, normalize_syncs, minimum,
maximum, null_ispace)
from devito.ir.support import (
PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext, Forward, Interval, IntervalGroup,
IterationSpace, DataSpace, Guards, Properties, Scope, WaitLock, WithLock,
PrefetchUpdate, detect_accesses, detect_io, normalize_properties,
tailor_properties, update_properties, normalize_syncs, minimum, maximum,
null_ispace
)
from devito.mpi.halo_scheme import HaloScheme, HaloTouch
from devito.mpi.reduction_scheme import DistReduce
from devito.symbolics import estimate_cost
from devito.tools import as_tuple, flatten, infer_dtype
from devito.tools import as_tuple, filter_ordered, flatten, infer_dtype
from devito.types import Fence, WeakFence, CriticalRegion

__all__ = ["Cluster", "ClusterGroup"]
@@ -52,7 +53,8 @@ def __init__(self, exprs, ispace=null_ispace, guards=None, properties=None,
self._syncs = normalize_syncs(syncs or {})

properties = Properties(properties or {})
self._properties = tailor_properties(properties, ispace)
properties = tailor_properties(properties, ispace)
self._properties = update_properties(properties, self.exprs)

self._halo_scheme = halo_scheme

@@ -482,15 +484,17 @@ def properties(self):

@cached_property
def guards(self):
"""The guards of each Cluster in self."""
return tuple(i.guards for i in self)
"""
A view of the ClusterGroup's guards.
"""
return tuple(filter_ordered(i.guards for i in self))

@cached_property
def syncs(self):
"""
A view of the ClusterGroup's synchronization operations.
"""
return normalize_syncs(*[c.syncs for c in self])
return normalize_syncs(*[c.syncs for c in self], strict=False)

@cached_property
def dspace(self):
@@ -540,19 +544,3 @@ def reduce_properties(clusters):
properties[d] = normalize_properties(properties.get(d, v), v)

return Properties(properties)


def tailor_properties(properties, ispace):
[Review comment by @FabioLuporini (Contributor, Author), Jan 14, 2025: note for reviewers: finally moved to ir/support/properties as promised in an old PR]

"""
Create a new Properties object off `properties` that retains all and only
the iteration dimensions in `ispace`.
"""
for i in properties:
for d in as_tuple(i):
if d not in ispace.itdims:
properties = properties.drop(d)

for d in ispace.itdims:
properties = properties.add(d)

return properties
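In the `ClusterGroup.guards` change above, `filter_ordered` (from `devito.tools`) drops duplicates while preserving the order of first appearance, so the returned tuple now lists each distinct guard set once. A trivial, self-contained sketch of the helper's behaviour:

```python
# Hedged sketch: filter_ordered keeps the first occurrence of each item, in order.
from devito.tools import filter_ordered

print(filter_ordered([3, 1, 3, 2, 1]))  # -> [3, 1, 2]
```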
80 changes: 76 additions & 4 deletions devito/ir/support/properties.py
@@ -86,6 +86,14 @@ def __init__(self, name, val=None):
"""

PREFETCHABLE = Property('prefetchable')
"""
A Dimension along which prefetching is feasible and beneficial.
"""

PREFETCHABLE_SHM = Property('prefetchable-shm')
"""
A Dimension along which shared-memory prefetching is feasible and beneficial.
"""


# Bundles
@@ -129,6 +137,62 @@ def relax_properties(properties):
return frozenset(properties - {PARALLEL_INDEP})


def tailor_properties(properties, ispace):
"""
Create a new Properties object off `properties` that retains all and only
the iteration dimensions in `ispace`.
"""
for i in properties:
for d in as_tuple(i):
if d not in ispace.itdims:
properties = properties.drop(d)

for d in ispace.itdims:
properties = properties.add(d)

return properties


def update_properties(properties, exprs):
"""
Create a new Properties object off `properties` augmented with properties
discovered from `exprs` or with properties removed if they are incompatible
with `exprs`.
"""
exprs = as_tuple(exprs)

if not exprs:
return properties

# Auto-detect prefetchable Dimensions
dims = set()
flag = False
for e in as_tuple(exprs):
w, r = e.args

# Ensure it's in the form `Indexed = Indexed`
try:
wf, rf = w.function, r.function
except AttributeError:
break

if not wf._mem_shared:
break
dims.update({d.parent for d in wf.dimensions if d.parent in properties})

if not rf._mem_heap:
break
else:
flag = True

if flag:
properties = properties.prefetchable_shm(dims)
else:
properties = properties.drop(properties=PREFETCHABLE_SHM)

return properties


class Properties(frozendict):

"""
@@ -183,12 +247,15 @@ def sequentialize(self, dims=None):
m[d] = normalize_properties(set(self.get(d, [])), {SEQUENTIAL})
return Properties(m)

def prefetchable(self, dims):
def prefetchable(self, dims, v=PREFETCHABLE):
m = dict(self)
for d in as_tuple(dims):
m[d] = self.get(d, set()) | {PREFETCHABLE}
m[d] = self.get(d, set()) | {v}
return Properties(m)

def prefetchable_shm(self, dims):
return self.prefetchable(dims, PREFETCHABLE_SHM)

def block(self, dims, kind='default'):
if kind == 'default':
p = TILABLE
@@ -232,8 +299,13 @@ def is_blockable(self, d):
def is_blockable_small(self, d):
return TILABLE_SMALL in self.get(d, set())

def is_prefetchable(self, dims):
return any(PREFETCHABLE in self.get(d, set()) for d in as_tuple(dims))
def is_prefetchable(self, dims=None, v=PREFETCHABLE):
if dims is None:
dims = list(self)
return any(v in self.get(d, set()) for d in as_tuple(dims))

def is_prefetchable_shm(self, dims=None):
return self.is_prefetchable(dims, PREFETCHABLE_SHM)

@property
def nblockable(self):
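The new `PREFETCHABLE_SHM` property composes with the existing `Properties` helpers. A short usage sketch, assuming a plain Devito `Dimension` and the module path shown in the diff:

```python
# Hedged sketch: tagging and querying shared-memory prefetchability on a dimension.
from devito import Dimension
from devito.ir.support.properties import Properties

x = Dimension(name='x')

props = Properties({})
props = props.prefetchable_shm([x])    # adds PREFETCHABLE_SHM to `x`

print(props.is_prefetchable_shm([x]))  # True
print(props.is_prefetchable_shm())     # True: passing no dims means "any tagged dimension"
print(props.is_prefetchable([x]))      # False: plain PREFETCHABLE was not set
```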
15 changes: 8 additions & 7 deletions devito/ir/support/syncs.py
@@ -164,7 +164,7 @@ def update(self, ops):
return Ops(m)


def normalize_syncs(*args):
def normalize_syncs(*args, strict=True):
if not args:
return {}

@@ -175,12 +175,13 @@ def normalize_syncs(*args):

syncs = {k: tuple(filter_ordered(v)) for k, v in syncs.items()}

for v in syncs.values():
waitlocks = [s for s in v if isinstance(s, WaitLock)]
withlocks = [s for s in v if isinstance(s, WithLock)]
if strict:
for v in syncs.values():
waitlocks = [s for s in v if isinstance(s, WaitLock)]
withlocks = [s for s in v if isinstance(s, WithLock)]

if waitlocks and withlocks:
# We do not allow mixing up WaitLock and WithLock ops
raise ValueError("Incompatible SyncOps")
if waitlocks and withlocks:
# We do not allow mixing up WaitLock and WithLock ops
raise ValueError("Incompatible SyncOps")

return Ops(syncs)