Various bugfixes on the feature
ThrudPrimrose committed Dec 6, 2024
1 parent 21dd0c3 commit e669f7c
Showing 5 changed files with 61 additions and 49 deletions.
11 changes: 0 additions & 11 deletions dace/codegen/targets/cpu.py
@@ -510,15 +510,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV

if not declared:
declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node)
# Initialize size array
size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape])
if (nodedesc.transient and (
nodedesc.storage == dtypes.StorageType.CPU_Heap or
nodedesc.storage == dtypes.StorageType.GPU_Global)
):
size_desc_name = nodedesc.size_desc_name
size_nodedesc = sdfg.arrays[size_desc_name]
declaration_stream.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n', cfg, state_id, node)
if deferred_allocation:
allocation_stream.write(
"%s = nullptr; // Deferred Allocation" %
@@ -539,8 +530,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
)

define_var(name, DefinedType.Pointer, ctypedef)
if not declared:
define_var(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype)

if node.setzero:
allocation_stream.write("memset(%s, 0, sizeof(%s)*%s);" %
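Note: the block removed above (together with its CUDA counterpart below) is superseded by a single size-array declaration pass in framecode.py. The initializer rule itself is unchanged: a dimension that is still deferred gets a 0 entry. A minimal sketch, using str() as a stand-in for dace.codegen.targets.cpp.sym2cpp:

```python
# Sketch of the size-initializer rule used throughout this commit; str() is a
# stand-in for dace.codegen.targets.cpp.sym2cpp.
def size_initializer(shape) -> str:
    # Deferred dimensions use reserved "__dace_defer*" symbols and start at 0.
    return ",".join("0" if str(dim).startswith("__dace_defer") else str(dim)
                    for dim in shape)

assert size_initializer([10, "__dace_defer_dim1"]) == "10,0"
```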
48 changes: 21 additions & 27 deletions dace/codegen/targets/cuda.py
@@ -299,7 +299,9 @@ def _compute_pool_release(self, top_sdfg: SDFG):

# Add sink as terminator state
for arr in unfreed:
self.pool_release[(sdfg, arr)] = (sink, set())
if (sdfg.arrays[arr].storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared]
and arr not in sdfg.size_arrays()): # Do not put size arrays into pool_release
self.pool_release[(sdfg, arr)] = (sink, set())

# Generate final code
def get_generated_codeobjects(self):
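Note: with the guard added above, only GPU-resident arrays reach the pooled-release map, and size arrays are excluded. A minimal sketch of the predicate, assuming this branch's sdfg.size_arrays() helper:

```python
# Sketch of the pool-release filter; sdfg.size_arrays() is a helper this
# branch adds (it returns the names of all size descriptors).
from dace import dtypes

def should_pool_release(sdfg, arr_name: str) -> bool:
    desc = sdfg.arrays[arr_name]
    return (desc.storage in (dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared)
            and arr_name not in sdfg.size_arrays())
```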
@@ -578,6 +580,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV

# Check if array is already declared
declared = False
size_declared = False
try:
self._dispatcher.declared_arrays.get(dataname)
declared = True # Array was already declared in this or upper scopes
@@ -608,17 +611,9 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
# Different types of GPU arrays
if nodedesc.storage == dtypes.StorageType.GPU_Global:
if not declared:
result_decl.write('%s %s;\n' % (ctypedef, dataname))
size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape])
if nodedesc.transient:
size_desc_name = nodedesc.size_desc_name
if size_desc_name is not None:
size_nodedesc = sdfg.arrays[size_desc_name]
result_decl.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n')
self._dispatcher.defined_vars.add(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype)
declaration_stream.write('%s %s;\n' % (ctypedef, dataname))
self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)


if deferred_allocation:
result_alloc.write(
"%s = nullptr; // Deferred Allocation" %
@@ -765,7 +760,7 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap

if nodedesc.storage == dtypes.StorageType.GPU_Global:
if not nodedesc.pool: # If pooled, will be freed somewhere else
callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));\n' % (self.backend, dataname), cfg, state_id, node)
callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));//a1\n' % (self.backend, dataname), cfg, state_id, node)
elif nodedesc.storage == dtypes.StorageType.CPU_Pinned:
callsite_stream.write('DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), cfg, state_id, node)
elif nodedesc.storage == dtypes.StorageType.GPU_Shared or \
@@ -1286,8 +1281,7 @@ def generate_state(self,
ptrname = cpp.ptr(name, desc, sd, self._frame)
if isinstance(desc, dt.Array) and desc.start_offset != 0:
ptrname = f'({ptrname} - {cpp.sym2cpp(desc.start_offset)})'

callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', sd)
callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));//a2\n', sd)
self._emit_sync(callsite_stream)
to_remove.add((sd, name))
for sd, name in to_remove:
@@ -1584,10 +1578,12 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
prototype_kernel_args[aname] = arg

if aname in sdfg.arrays:
size_arr_name = data_desc.size_desc_name
if size_arr_name is not None:
size_arr = sdfg.arrays[data_desc.size_desc_name]
host_size_args[size_arr_name] = size_arr
arr = sdfg.arrays[aname]
if arr.transient and arr.storage == dtypes.StorageType.GPU_Global and arr.size_desc_name is not None:
size_arr_name = data_desc.size_desc_name
if size_arr_name is not None:
size_arr = sdfg.arrays[data_desc.size_desc_name]
host_size_args[size_arr_name] = size_arr

kernel_args_typed = [('const ' if k in const_params else '') + v.as_arg(name=k)
for k, v in prototype_kernel_args.items()]
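Note: the rewritten guard above forwards a size array to the kernel launcher only for transient GPU_Global arrays that actually carry a size descriptor. As a sketch (size_desc_name is the attribute this branch adds to array descriptors):

```python
# Sketch of the tightened host_size_args guard; size_desc_name is a
# branch-specific descriptor attribute.
from dace import dtypes

def needs_size_arg(arr) -> bool:
    return (arr.transient
            and arr.storage == dtypes.StorageType.GPU_Global
            and arr.size_desc_name is not None)
```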
@@ -1626,18 +1622,16 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
# Size arrays
needed_size_scalars_declaration = []
for size_desc_name, arg in host_size_args.items():
if isinstance(arg, dt.Array):
size_arr = arg
arr_name = size_desc_name.removesuffix("_size")
for i in range(size_arr.shape[0]):
if f"__{arr_name}_dim{i}_size" not in dyn_args:
dyn_args.append(f"__{arr_name}_dim{i}_size")
dyn_args_typed.append(f"const {dace.uint64} __{arr_name}_dim{i}_size")
needed_size_scalars_declaration.append(f"const {dace.uint64} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
arr_name = size_desc_name.removesuffix("_size")
for i in range(arg.shape[0]):
if f"__{arr_name}_dim{i}_size" not in dyn_args:
dyn_args.append(f"__{arr_name}_dim{i}_size")
dyn_args_typed.append(f"const {dace.uint64} __{arr_name}_dim{i}_size")
needed_size_scalars_declaration.append(f"const {dace.uint64} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")

self._localcode.write(
'__global__ void %s %s(%s) {\n' %
(launch_bounds, kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed + dyn_args_typed)), sdfg, state_id, node)
(launch_bounds, kernel_name, ', '.join(kernel_args_typed + dyn_args_typed + extra_kernel_args_typed)), sdfg, state_id, node)

# Write constant expressions in GPU code
self._frame.generate_constants(sdfg, self._localcode)
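Note: inside the kernel, sizes arrive as per-dimension scalars whose names are derived from the size-array name, as the loop above shows. A minimal illustration of the naming scheme:

```python
# Illustration of the kernel size-scalar naming scheme ("A_size" -> "A").
def size_scalar_params(size_desc_name: str, ndims: int) -> list:
    arr_name = size_desc_name.removesuffix("_size")
    return [f"__{arr_name}_dim{i}_size" for i in range(ndims)]

assert size_scalar_params("A_size", 2) == ["__A_dim0_size", "__A_dim1_size"]
```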
@@ -1713,7 +1707,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
self._localcode.write(
memlet_definition,
cfg, state_id, scope_entry)
self._localcode.write("// Array sizes of arrays are passed to the kernel even if not used in maps")
self._localcode.write("// Array sizes of arrays used are passed here if needed to the kernel even")
for decl in needed_size_scalars_declaration:
self._localcode.write(decl, cfg, state_id, scope_entry)

15 changes: 15 additions & 0 deletions dace/codegen/targets/framecode.py
@@ -834,6 +834,7 @@ def allocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Un
self._dispatcher.dispatch_allocate(tsdfg, cfg if state is None else state.parent_graph, state, state_id,
node, desc, function_stream, callsite_stream, declare, allocate)


def deallocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Union[nodes.EntryNode, SDFGState,
SDFG],
function_stream: CodeIOStream, callsite_stream: CodeIOStream):
@@ -907,6 +908,20 @@ def generate_code(self,
global_symbols = copy.deepcopy(sdfg.symbols)
global_symbols.update({aname: arr.dtype for aname, arr in sdfg.arrays.items()})

# Allocate size arrays (always check, since renames and array changes affect size descriptor names)
size_arrays = {(v.size_desc_name, sdfg.arrays[v.size_desc_name])
for v in sdfg.arrays.values()
if v.size_desc_name is not None and v.size_desc_name in sdfg.arrays}
callsite_stream.write('// Declare size arrays\n', sdfg)
for size_desc_name, size_nodedesc in size_arrays:
assert ("__return" not in size_desc_name)
ctypedef = size_nodedesc.dtype.ctype
from dace.codegen.targets import cpp
size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in size_nodedesc.shape])
alloc_str = f'{ctypedef} {size_desc_name}[{len(size_nodedesc.shape)}]{{{size_str}}};\n'
callsite_stream.write(alloc_str)
self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef)

interstate_symbols = {}
for cfr in sdfg.all_control_flow_regions():
for e in cfr.dfs_edges(cfr.start_block):
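Note: this pass now declares every size array exactly once, at frame-code level, instead of per allocation in each target. A condensed sketch of what gets emitted, with plain (ctype, shape) tuples standing in for DaCe descriptors:

```python
# Condensed sketch of the declaration pass; (ctype, shape) tuples stand in
# for DaCe data descriptors. Deferred entries start at 0.
def size_array_decls(size_arrays: dict) -> list:
    decls = []
    for name, (ctype, shape) in size_arrays.items():
        init = ",".join("0" if str(d).startswith("__dace_defer") else str(d)
                        for d in shape)
        decls.append(f"{ctype} {name}[{len(shape)}]{{{init}}};")
    return decls

assert size_array_decls({"A_size": ("unsigned long long", [10, "__dace_defer_dim1"])}) == \
    ["unsigned long long A_size[2]{10,0};"]
```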
25 changes: 15 additions & 10 deletions dace/sdfg/sdfg.py
@@ -753,7 +753,8 @@ def replace_dict(self,

# Replace in arrays and symbols (if a variable name)
size_arrays = {v.size_desc_name for v in self.arrays.values()
if v.size_desc_name is not None and v.size_desc_name in self.arrays}
if v.size_desc_name is not None and v.size_desc_name in self.arrays
and v.transient}
non_size_arrays = {k for k in self.arrays if k not in size_arrays}
size_desc_map = dict()

@@ -763,7 +764,7 @@
for name, new_name in repldict_filtered.items():
if validate_name(new_name):
_replace_dict_keys(self.arrays, name, new_name, non_size_arrays)
if new_name != "__return":
if "__return" not in new_name: # To catch __return_0, __return_1, gpu__return
size_desc_map[new_name] = new_name + "_size"
_replace_dict_keys(self.arrays, name + "_size", new_name + "_size", size_arrays)
_replace_dict_keys(self.symbols, name, new_name)
@@ -772,12 +773,15 @@
_replace_dict_values(self.callback_mapping, name, new_name)

# Update size descriptors
# Return_size breaks things; delete it from the arrays
# Changed symbol names might not be related to arrays
# Return_size breaks things (it is collected into the return tuple); delete it from the arrays
# If this was called because the array's properties changed, set the size descriptor to None
for arr_name, size_desc_name in size_desc_map.items():
arr = self.arrays[arr_name] if arr_name in self.arrays else None
if arr is not None:
arr.size_desc_name = size_desc_name if size_desc_name != "__return_size" else None
if arr.transient and isinstance(arr, dt.Array):
arr.size_desc_name = size_desc_name if "__return" not in arr_name else None
else:
arr.size_desc_name = None

# Replace inside data descriptors
for array in self.arrays.values():
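Note: the net effect of the changes above is that renaming a data descriptor also renames its size descriptor, while anything return-like loses it. A minimal sketch of that rule:

```python
# Sketch of the size-descriptor rename rule: return-like names ("__return",
# "__return_0", "gpu__return", ...) drop their size descriptor.
from typing import Optional

def renamed_size_desc(new_name: str) -> Optional[str]:
    return None if "__return" in new_name else new_name + "_size"

assert renamed_size_desc("B") == "B_size"
assert renamed_size_desc("__return_0") is None
```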
@@ -1771,6 +1775,8 @@ def add_array(self,

# Every Array also supports reallocation, so we need to create a secondary size array
# The array size is constant and not changeable, yet the values in the array can change
if name.endswith("_size"):
raise InvalidSDFGError("Array names are not allowed to end with _size", self, None)

# convert strings to int if possible, unless it is not the reserved symbol for deferred allocation
newshape = []
@@ -2103,16 +2109,15 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str
# Add the data descriptor to the SDFG and all symbols that are not yet known.
self._arrays[name] = datadesc
self._add_symbols(datadesc)

if (
datadesc.transient is True and
isinstance(datadesc, dt.Array) and
name != "__return"
"__return" not in name
):
size_desc_name = f"{name}_size"
size_desc = dt.Array(dtype=dace.uint64,
shape=(len(datadesc.shape),),
storage=dtypes.StorageType.Default,
storage=dtypes.StorageType.CPU_Heap,
location=None,
allow_conflicts=False,
transient=True,
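Note: with this hunk, every transient non-return dt.Array registered through add_datadesc gets a companion uint64 size array in CPU_Heap storage. A hedged usage sketch (the "A_size" entry exists only with this branch applied):

```python
# Usage sketch; assumes this branch, where add_datadesc registers a companion
# "<name>_size" descriptor for transient non-return arrays.
import dace
from dace import dtypes

sdfg = dace.SDFG("size_desc_demo")
sdfg.add_array("A", [10, 10], dace.float32, transient=True,
               storage=dtypes.StorageType.GPU_Global)
size_desc = sdfg.arrays["A_size"]             # created alongside "A"
assert size_desc.dtype == dace.uint64         # one uint64 entry per dimension
assert tuple(size_desc.shape) == (2,)
assert size_desc.storage == dtypes.StorageType.CPU_Heap
```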
@@ -2468,8 +2473,8 @@ def argument_typecheck(self, args, kwargs, types_only=False):

# Omit return values from arguments
expected_args = collections.OrderedDict([(k, v) for k, v in expected_args.items()
if not k.startswith('__return')])
kwargs = {k: v for k, v in kwargs.items() if not k.startswith('__return')}
if '__return' not in k])
kwargs = {k: v for k, v in kwargs.items() if '__return' not in k}

num_args_passed = len(args) + len(kwargs)
num_args_expected = len(expected_args)
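Note: the switch above from startswith to a substring test also filters GPU-side return copies. A tiny illustration:

```python
# Illustration: the substring test also drops "gpu__return", which
# startswith('__return') would have kept.
names = ["x", "__return", "__return_0", "gpu__return"]
assert [k for k in names if "__return" not in k] == ["x"]
```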
11 changes: 10 additions & 1 deletion tests/deferred_alloc_test.py
@@ -2,7 +2,6 @@
from dace.transformation.dataflow.redundant_array import RedundantArray, RedundantSecondArray
from dace.transformation.interstate.state_fusion import StateFusion
import numpy
import cupy
import pytest

@pytest.fixture(params=[dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global])
@@ -145,6 +144,11 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sch
compiled_sdfg(user_size=user_size, example_array=arr)
assert ( arr[0] == 3.0 )
if storage_type == dace.dtypes.StorageType.GPU_Global:
try:
import cupy
except Exception:
return

arr = cupy.array([-1.0]).astype(cupy.float32)
user_size = numpy.array([10, 10]).astype(numpy.uint64)
compiled_sdfg(user_size=user_size, example_array=arr)
@@ -160,6 +164,11 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sch
compiled_sdfg(user_size=user_size, example_array=arr)
assert ( arr[0] == 3.0 )
if storage_type == dace.dtypes.StorageType.GPU_Global:
try:
import cupy
except Exception:
return

arr = cupy.array([-1.0]).astype(cupy.float32)
user_size = numpy.array([10, 10]).astype(numpy.uint64)
compiled_sdfg(user_size=user_size, example_array=arr)
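Note: the guarded import above lets the CPU cases run on machines without cupy. pytest's importorskip is a common alternative to the silent return; a sketch, not what the test currently does:

```python
# Alternative sketch using pytest: skip, rather than silently return, when
# cupy is unavailable.
import pytest

def make_gpu_input():
    cupy = pytest.importorskip("cupy")  # skips the calling test if missing
    return cupy.array([-1.0]).astype(cupy.float32)
```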
