From 55a613eebc3a4b797bf181092ab2442c069c3842 Mon Sep 17 00:00:00 2001
From: Michael Gilbert
Date: Thu, 24 Oct 2024 15:08:05 -0400
Subject: [PATCH] New mapper

Add a second mapper implementation (mapper2.py) that enumerates fused
mappings per Einsum under a MAC-array constraint, explores tile shapes
through ShapeSubspace, and prunes shapes that exceed per-level capacity
limits derived from the architecture. Extend fastmodel to accept
pre-resolved integer IDs and fixed tile shapes, and point the mapper
test at the new entry point.

---
 pytimeloop/fastfusion/fastmodel/fastmodel.py |  46 ++-
 pytimeloop/fastfusion/mapper/mapper.py       |   1 -
 pytimeloop/fastfusion/mapper/mapper2.py      | 332 +++++++++++++++---
 .../fastfusion/mapper/shape_subspace.py      |  22 +-
 tests/fastfusion/test_mapper.py              |  73 ++--
 5 files changed, 353 insertions(+), 121 deletions(-)

diff --git a/pytimeloop/fastfusion/fastmodel/fastmodel.py b/pytimeloop/fastfusion/fastmodel/fastmodel.py
index d41f04d..1f98486 100644
--- a/pytimeloop/fastfusion/fastmodel/fastmodel.py
+++ b/pytimeloop/fastfusion/fastmodel/fastmodel.py
@@ -17,7 +17,10 @@ def compile_mapping(mapping,
     tensor_name_to_id = workload.data_space_name_to_id()

     einsum_name = mapping[-1]['einsum']
-    einsum_id = einsum_name_to_id[einsum_name]
+    if isinstance(einsum_name, int):
+        einsum_id = einsum_name
+    else:
+        einsum_id = einsum_name_to_id[einsum_name]

     tensors = (
         workload.tensors_read_by_einsum(einsum_id)
@@ -63,12 +66,18 @@ def compile_mapping(mapping,
     for node in mapping:
         if node['type'] == 'temporal':
             rank_name = node['rank']
-            rank_id = rank_name_to_id[rank_name]
+            if isinstance(rank_name, int):
+                rank_id = rank_name
+            else:
+                rank_id = rank_name_to_id[rank_name]
             group_id = rank_groups.rank_to_group_id[rank_id]

-            tile_shape = sympy.symbols(f'tileshape{len(tile_shapes)}')
-            tile_shapes.append(tile_shape)
-            factor = einsum_shape[group_id] // tile_shape
+            if 'tile_shape' not in node:
+                tile_shape = sympy.symbols(f'tileshape{len(tile_shapes)}')
+                tile_shapes.append(tile_shape)
+            else:
+                tile_shape = node['tile_shape']
+            factor = einsum_shape[group_id] / tile_shape
             einsum_shape[group_id] = tile_shape

             latency *= factor
@@ -78,7 +87,7 @@ def compile_mapping(mapping,
                 if group_id in relevant_ranks:
                     actual_tensor_access_multiplier[tensor_id] = \
                         potential_tensor_access_multiplier[tensor_id]
-                    tensor_size[tensor_id] //= factor
+                    tensor_size[tensor_id] /= factor
                 else:
                     potential_tensor_access_multiplier[tensor_id] *= factor
         elif node['type'] == 'sequential':
@@ -87,13 +96,24 @@ def compile_mapping(mapping,
                     potential_tensor_access_multiplier[tensor_id]
         elif node['type'] == 'spatial':
             rank_name = node['rank']
-            rank_id = rank_name_to_id[rank_name]
+            if isinstance(rank_name, int):
+                rank_id = rank_name
+            else:
+                rank_id = rank_name_to_id[rank_name]
             group_id = rank_groups.rank_to_group_id[rank_id]

-            tile_shape = sympy.symbols(f'tileshape{len(tile_shapes)}')
-            tile_shapes.append(tile_shape)
-            factor = einsum_shape[group_id] // tile_shape
+            if 'tile_shape' not in node:
+                tile_shape = sympy.symbols(f'tileshape{len(tile_shapes)}')
+                tile_shapes.append(tile_shape)
+            else:
+                tile_shape = node['tile_shape']
+            factor = einsum_shape[group_id] / tile_shape
             einsum_shape[group_id] = tile_shape
+
+            for tensor_id in tensors:
+                relevant_ranks = tensor_to_relevant_ranks[tensor_id]
+                if group_id in relevant_ranks:
+                    tensor_size[tensor_id] /= factor

             if 'spatial' not in node:
                 spatial = 0
@@ -107,7 +127,10 @@ def compile_mapping(mapping,
             target = node['target']
             tensor_names = node['dspace']
             for tensor_name in tensor_names:
-                tensor_id = tensor_name_to_id[tensor_name]
+                if isinstance(tensor_name, int):
+                    tensor_id = tensor_name
+                else:
+                    tensor_id = tensor_name_to_id[tensor_name]
                 if tensor_id not in tensors:
                     continue

@@ -166,5 +189,6 @@ def lambdify(d):
     output.temporal_steps = lambdify(output.temporal_steps)
     output.fanout = lambdify(output.fanout)
     output.occupancy = lambdify(output.occupancy)
+    output.fills_by_parent = lambdify(output.fills_by_parent)

     return tile_shapes, output
\ No newline at end of file
diff --git a/pytimeloop/fastfusion/mapper/mapper.py b/pytimeloop/fastfusion/mapper/mapper.py
index cc16e97..7df3b7f 100644
--- a/pytimeloop/fastfusion/mapper/mapper.py
+++ b/pytimeloop/fastfusion/mapper/mapper.py
@@ -215,7 +215,6 @@ def get_neighbors(workload):


 def get_intermediate_tensors(workload: LooptreeWorkload):
-    tensor_id_to_name = workload.data_space_id_to_name()
     result = set()
     for einsum in workload.einsum_id_to_name():
         written_tensors = workload.tensors_written_by_einsum(einsum)
diff --git a/pytimeloop/fastfusion/mapper/mapper2.py b/pytimeloop/fastfusion/mapper/mapper2.py
index 7937e0b..ba150b7 100644
--- a/pytimeloop/fastfusion/mapper/mapper2.py
+++ b/pytimeloop/fastfusion/mapper/mapper2.py
@@ -1,6 +1,8 @@
-from collections import defaultdict, deque
-from functools import partial
+from collections import defaultdict
+from dataclasses import dataclass
 from itertools import product, permutations
+from functools import reduce
+from operator import or_, mul

 from ruamel.yaml import YAML
 yaml = YAML(typ='safe')
@@ -10,12 +12,37 @@
     LooptreeWorkloadDependencyAnalyzer
 )

-from pytimeloop.fastfusion.fastmodel import compile_mapping
+from pytimeloop.fastfusion.fastmodel import compile_mapping, LooptreeOutput
+from pytimeloop.fastfusion.mapper.shape_subspace import ShapeSubspace


 class LinearMapping:
-    def add_compute(self, einsum_name):
-        self.mapping.append({'type': 'compute', 'einsum': einsum_name})
+    def __init__(self):
+        self.mapping = []
+
+    def __iter__(self):
+        return iter(self.mapping)
+
+    def __getitem__(self, key):
+        return self.mapping[key]
+
+    def __len__(self):
+        return len(self.mapping)
+
+    def __repr__(self):
+        return repr(self.mapping)
+
+    def copy(self):
+        lm = LinearMapping()
+        lm.mapping = self.mapping.copy()
+        return lm
+
+    def add_compute(self, einsum_name, target):
+        self.mapping.append({
+            'type': 'compute',
+            'einsum': einsum_name,
+            'target': target
+        })

     def add_temporal(self, rank_name, tile_shape=None):
         node = {'type': 'temporal', 'rank': rank_name}
@@ -29,24 +56,59 @@ def add_spatial(self, rank_name, tile_shape=None):
             node['tile_shape'] = tile_shape
         self.mapping.append(node)

-    def add_sequential(self):
-        self.mapping.append({'type': 'sequential'})
+    def add_sequential(self, idx=None):
+        node = {'type': 'sequential'}
+        if idx is None:
+            self.mapping.append(node)
+        else:
+            self.mapping.insert(idx, node)

     def add_pipeline(self):
         self.mapping.append({'type': 'pipeline'})

-    def add_storage(self, target, dspaces):
-        self.mapping.append({
+    def add_storage(self, target, dspaces, idx=None):
+        node = {
             'type': 'storage',
             'target': target,
             'dspace': dspaces
-        })
+        }
+        if idx is None:
+            self.mapping.append(node)
+        else:
+            self.mapping.insert(idx, node)
+
+
+@dataclass
+class MacArrayConstraint:
+    array_shape_in_parallel_dimension: str
+    array_shape_in_reduced_dimension: str
+
+    weight_tensor: dict[str, str]
+    parallel_rank: dict[str, str]
+    reduced_rank: dict[str, str]


-def mapper(config, spec, tmp_path, verbose_stream=None):
+def mapper(config,
+           mac_array_constraint: MacArrayConstraint,
+           spec,
+           tmp_path,
+           verbose_stream=None):
+
     workload = LooptreeWorkload.parse_cfg(config.root['problem'])
     analyzer = LooptreeWorkloadDependencyAnalyzer(workload)

+    einsum_id_to_name = workload.einsum_id_to_name()
+    rank_name_to_id = workload.dimension_name_to_id()
+    tensor_name_to_id = workload.data_space_name_to_id()
+
+    mac_parallel_shape = mac_array_constraint.array_shape_in_parallel_dimension
+    mac_reduced_shape = mac_array_constraint.array_shape_in_reduced_dimension
+
+    einsum_name_to_parallel_rank_name = mac_array_constraint.parallel_rank
+    einsum_name_to_reduced_rank_name = mac_array_constraint.reduced_rank
+
+    bindings, max_fanout, max_capacity = get_hardware_levels(spec.architecture)
+
     einsum_name_to_id = workload.einsum_name_to_id()
     for einsum_id in einsum_name_to_id.values():
         tensors = (
@@ -56,28 +118,66 @@ def mapper(config, spec, tmp_path, verbose_stream=None):
         )
         intermediate_tensors = tensors & get_intermediate_tensors(workload)

+        einsum_name = einsum_id_to_name[einsum_id]
+        mac_parallel_rank_name = einsum_name_to_parallel_rank_name[einsum_name]
+        mac_parallel_rank_id = rank_name_to_id[mac_parallel_rank_name]
+        mac_reduced_rank_name = einsum_name_to_reduced_rank_name[einsum_name]
+        mac_reduced_rank_id = rank_name_to_id[mac_reduced_rank_name]
+
+        weight_tensor_name = mac_array_constraint.weight_tensor[einsum_name]
+        weight_tensor_id = tensor_name_to_id[weight_tensor_name]
+        weight_ranks = analyzer.einsum_dims_relevant_to_tensor(einsum_id,
+                                                               weight_tensor_id)
+        other_weight_ranks = \
+            weight_ranks - {mac_parallel_rank_id, mac_reduced_rank_id}
+        all_ranks = workload.einsum_ospace_dimensions(einsum_id)
+        non_weight_ranks = set(all_ranks) - weight_ranks
+
+        tensor_to_relevant_ranks = {
+            tensor: analyzer.einsum_dims_relevant_to_tensor(einsum_id, tensor)
+            for tensor in tensors
+        }
+
+        einsum_shape = {
+            rank_id: workload.get_rank_shape(rank_id)[1]+1 for rank_id in all_ranks
+        }
+
+
+        count = 0
         mapping = LinearMapping()
+        top_level_ranks = reduce(
+            or_,
+            (tensor_to_relevant_ranks[t] for t in intermediate_tensors),
+            set()
+        )
-        for partial_mapping in make_top_loops(mapping, einsum_id, workload):
+        for partial_mapping in make_top_loops(mapping, top_level_ranks):
             for partial_mapping in place_fusion_level(partial_mapping,
-                                                      intermediate_tensors):
+                                                      intermediate_tensors,
+                                                      tensor_to_relevant_ranks):
                 for partial_mapping in make_pe_spatial_fors(partial_mapping,
-                                                            einsum_id,
-                                                            workload):
+                                                            all_ranks):
                     for partial_mapping in make_pe_temporal_fors(partial_mapping,
-                                                                 einsum_id,
-                                                                 workload):
+                                                                 all_ranks):
                         for partial_mapping in place_pe_level(partial_mapping,
-                                                              tensors):
+                                                              tensors,
+                                                              tensor_to_relevant_ranks):
                             for partial_mapping in make_mac_level_loops(partial_mapping,
                                                                         einsum_id,
-                                                                        parallel_rank,
-                                                                        parallel_rank_shape,
-                                                                        reduced_rank,
-                                                                        reduced_rank_shape,
+                                                                        mac_parallel_rank_id,
+                                                                        mac_parallel_shape,
+                                                                        mac_reduced_rank_id,
+                                                                        mac_reduced_shape,
                                                                         non_weight_ranks,
                                                                         other_weight_ranks):
-                                compiled_results = compile_mapping(partial_mapping)
-                                explore_tile_shape(partial_mapping, compiled_results)
+                                _, compiled_results = compile_mapping(partial_mapping,
+                                                                      workload,
+                                                                      analyzer)
+                                count += explore_tile_shape(partial_mapping,
+                                                            einsum_shape,
+                                                            compiled_results,
+                                                            max_capacity,
+                                                            max_fanout)
+        print(count/1e6)


     # Determine all relevant ranks for top loops
@@ -88,16 +188,21 @@ def mapper(config, spec, tmp_path, verbose_stream=None):
     # Add temporal loops (for MAC)


-def make_top_loops(mapping: LinearMapping, einsum_id, workload):
-    ranks = workload.einsum_ospace_dimensions(einsum_id)
+def make_top_loops(mapping: LinearMapping, ranks):
+    original = mapping
+    print('n_top_loops:',
+          sum(reduce(mul, range(i, len(ranks)+1), 1) for i in range(1, len(ranks)+1)))
     for r in range(len(ranks)+1):
         for ordered_ranks in permutations(ranks, r=r):
+            mapping = original.copy()
             for r in ordered_ranks:
-                mapping.add_temporal_loop(r)
+                mapping.add_temporal(r)
             yield mapping


-def place_fusion_level(mapping: LinearMapping, intermediate_tensors):
+def place_fusion_level(mapping: LinearMapping,
+                       intermediate_tensors,
+                       tensor_to_relevant_ranks):
     top_idx = 0
     for node in mapping:
         if node['type'] != 'storage':
@@ -110,50 +215,67 @@ def place_fusion_level(mapping: LinearMapping, intermediate_tensors):
         relevant_ranks = tensor_to_relevant_ranks[tensor_id]
         tensor_choices = []
         last_is_relevant = True
+        untiled = True
         for i, node in enumerate(mapping[top_idx:], start=top_idx):
             if node['type'] == 'temporal':
+                untiled = False
                 rank_id = node['rank']
                 is_relevant = rank_id in relevant_ranks
                 if last_is_relevant and not is_relevant:
                     # Choice 1: fused
-                    tensor_choices.append((i, 'GLB'))
-                    # If untiled, choice 2: unfused
-                    if i == top_idx:
-                        tensor_choices.append((i, 'DRAM'))
+                    tensor_choices.append((i, 1))
                 last_is_relevant = is_relevant
+        if last_is_relevant:
+            tensor_choices.append((len(mapping), 1))
+
+        # If untiled, another choice: unfused
+        if untiled:
+            tensor_choices.append((len(mapping), 0))
+
         all_tensor_choices.append(tensor_choices)

+    original = mapping.copy()
+    print('n_fusion_level:', count(product(*all_tensor_choices)))
+    for choice in product(*all_tensor_choices):
+        print(choice)
     for choices in product(*all_tensor_choices):
         if not any(c == len(mapping) for (c, level) in choices):
             continue
+        mapping = original.copy()
         for choice, tensor in sorted(zip(choices, intermediate_tensors),
                                      key=lambda pair: pair[0],
                                      reverse=True):
             idx, level = choice
-            mapping.insert_sequential(idx)
-            mapping.insert_storage(idx, level, tensor)
+            mapping.add_sequential(idx)
+            mapping.add_storage(level, {tensor}, idx=idx)
         yield mapping


-def make_pe_spatial_fors(mapping, einsum_id, workload):
-    ranks = workload.einsum_ospace_dimensions(einsum_id)
+def make_pe_spatial_fors(mapping, ranks):
+    original = mapping.copy()
+    print('n_pe_spatial:',
+          sum(reduce(mul, range(i, len(ranks)+1), 1) for i in range(1, len(ranks)+1)))
     for r in range(len(ranks)+1):
         for ordered_ranks in permutations(ranks, r=r):
+            mapping = original.copy()
             for r in ordered_ranks:
-                mapping.add_spatial_loop(r)
+                mapping.add_spatial(r)
             yield mapping


-def make_pe_temporal_fors(mapping, einsum_id, workload):
-    ranks = workload.einsum_ospace_dimensions(einsum_id)
+def make_pe_temporal_fors(mapping, ranks):
+    original = mapping.copy()
+    print('n_pe_temporal:',
+          sum(reduce(mul, range(i, len(ranks)+1), 1) for i in range(1, len(ranks)+1)))
     for r in range(len(ranks)+1):
         for ordered_ranks in permutations(ranks, r=r):
+            mapping = original.copy()
             for r in ordered_ranks:
-                mapping.add_spatial_loop(r)
+                mapping.add_spatial(r)
             yield mapping


-def place_pe_level(mapping, tensors):
+def place_pe_level(mapping, tensors, tensor_to_relevant_ranks):
     all_tensor_choices = []
     for tensor_id in tensors:
         relevant_ranks = tensor_to_relevant_ranks[tensor_id]
@@ -164,16 +286,20 @@ def place_pe_level(mapping, tensors):
                 rank_id = node['rank']
                 is_relevant = rank_id in relevant_ranks
                 if last_is_relevant and not is_relevant:
-                    tensor_choices.append((i, 'PE'))
+                    tensor_choices.append((i, 2))
                 last_is_relevant = is_relevant
+        if last_is_relevant:
+            tensor_choices.append((len(mapping), 2))

         all_tensor_choices.append(tensor_choices)

+    original = mapping.copy()
     for choices in product(*all_tensor_choices):
+        mapping = original.copy()
         for choice, tensor in sorted(zip(choices, tensors),
                                      key=lambda pair: pair[0],
                                      reverse=True):
             idx, level = choice
-            mapping.insert_storage(idx, level, tensor)
+            mapping.add_storage(level, {tensor}, idx=idx)
         yield mapping

@@ -185,17 +311,119 @@ def make_mac_level_loops(mapping,
                          reduced_rank_shape,
                          non_weight_ranks,
                          other_weight_ranks):
+    mapping = mapping.copy()
     for rank in other_weight_ranks:
-        mapping.add_temporal_loop(rank, 1)
-    mapping.add_temporal_loop(parallel_rank, parallel_rank_shape)
-    mapping.add_temporal_loop(reduced_rank, reduced_rank_shape)
+        mapping.add_temporal(rank, 1)
+    mapping.add_temporal(parallel_rank, parallel_rank_shape)
+    mapping.add_temporal(reduced_rank, reduced_rank_shape)
     for rank in non_weight_ranks:
-        mapping.add_temporal_loop(rank, 1)
-    mapping.add_spatial_loop(parallel_rank, 1)
-    mapping.add_spatial_loop(reduced_rank, 1)
-    mapping.add_compute(einsum_id)
+        mapping.add_temporal(rank, 1)
+    mapping.add_spatial(parallel_rank, 1)
+    mapping.add_spatial(reduced_rank, 1)
+    mapping.add_compute(einsum_id, 3)
     yield mapping


-def explore_tile_shape(mapping, compiled_results):
-    pass
+def explore_tile_shape(mapping,
+                       rank_shapes,
+                       compiled_result,
+                       max_capacity,
+                       max_fanout,
+                       only_count=False):
+    ranks = []
+    for node in mapping:
+        if node['type'] in ['temporal', 'spatial'] and 'tile_shape' not in node:
+            ranks.append(node['rank'])
+
+    num_tile_shapes = 0
+
+    shape_subspace = iter(ShapeSubspace(rank_shapes, ranks))
+    for shape in shape_subspace:
+        num_tile_shapes += 1
+        if only_count:
+            continue
+
+        result = LooptreeOutput()
+        result.ops = call_with_arg(compiled_result.ops, shape)
+        result.temporal_steps = call_with_arg(compiled_result.temporal_steps, shape)
+        result.fanout = call_with_arg(compiled_result.fanout, shape)
+        result.occupancy = call_with_arg(compiled_result.occupancy, shape)
+        result.fills_by_parent = call_with_arg(compiled_result.fills_by_parent, shape)
+
+        skip = False
+
+        total_capacity = defaultdict(lambda: 0)
+        for (level, _), capacity in result.occupancy.items():
+            total_capacity[level] += capacity
+        for level, capacity in total_capacity.items():
+            if level in max_capacity and capacity > max_capacity[level]:
+                skip = True
+                break
+
+        if skip == True:
+            shape_subspace.skip_current_rank_iteration()
+            continue
+
+        for level, fanout in result.fanout.items():
+            if level in max_fanout:
+                invalid_spatial = any(
+                    spatial_fanout_in_dim > max_fanout_in_dim
+                    for spatial_fanout_in_dim, max_fanout_in_dim
+                    in zip(fanout, max_fanout[level])
+                )
+                # if invalid_spatial:
+                #     skip = True
+                #     break
+
+        if skip == True:
+            shape_subspace.skip_current_rank_iteration()
+            continue
+
+    return num_tile_shapes
+
+
+def get_intermediate_tensors(workload: LooptreeWorkload):
+    result = set()
+    for einsum in workload.einsum_id_to_name():
+        written_tensors = workload.tensors_written_by_einsum(einsum)
+        for tensor in written_tensors:
+            reader_einsums = workload.reader_einsums(tensor)
+            for reader in reader_einsums:
+                if reader in workload.einsum_id_to_name():
+                    result.add(tensor)
+                    break
+
+    return result
+
+
+def get_hardware_levels(arch):
+    bindings = {}
+    fanout = {}
+    max_capacity = {}
+    for node in arch['nodes']:
+        bindings_id = len(bindings)
+        bindings[bindings_id] = node['name']
+        fanout[bindings_id] = (node.spatial.meshX, node.spatial.meshY)
+        attribute = node.attributes
+        if 'width' in attribute and 'depth' in attribute:
+            width = attribute.width
+            depth = attribute.depth
+            datawidth = attribute.datawidth
+            if all(x is not None for x in (width, depth, datawidth)):
+                max_capacity[bindings_id] = \
+                    attribute.width * attribute.depth / attribute.datawidth
+    return bindings, fanout, max_capacity
+
+
+def call_with_arg(f, arg):
+    if isinstance(next(iter(f.values())), tuple):
+        return { k: (v[0], v[1](*arg)) for k, v in f.items() }
+    else:
+        return { k: v(*arg) for k, v in f.items() }
+
+
+def count(it):
+    count = 0
+    for _ in it:
+        count += 1
+    return count
diff --git a/pytimeloop/fastfusion/mapper/shape_subspace.py b/pytimeloop/fastfusion/mapper/shape_subspace.py
index bd06388..41a9ec2 100644
--- a/pytimeloop/fastfusion/mapper/shape_subspace.py
+++ b/pytimeloop/fastfusion/mapper/shape_subspace.py
@@ -53,6 +53,8 @@ def __next__(self):
                     break
                 except StopIteration as e:
                     pass
+            if len(self.choice_iterators) == 0:
+                idx = 0
             for j in range(idx+1, len(self.choice_iterators)):
                 self.restart_iterator(j)
         else:
@@ -73,7 +75,10 @@ def skip_current_rank_iteration(self, chain_skip_if_first=True):
                 idx = len(self.choice_iterators)-i-1
                 if not self.is_first_choice[idx]:
                     break
-            skip_limit = i+1
+            if len(self.choice_iterators) == 0:
+                skip_limit = 0
+            else:
+                skip_limit = i+1

             if skip_limit == len(self.choice_iterators):
                 self.is_done = True
@@ -95,13 +100,18 @@ def skip_current_rank_iteration(self, chain_skip_if_first=True):

     def make_choice_generators(self, shape_subspace: ShapeSubspace):
         choice_generators = []
-        for _ in shape_subspace.ranks:
-            choice_generators.append(
-                lambda shape: [
+
+        def gen(shape):
+            if shape == 1:
+                return [1]
+            else:
+                return [
                     s[0] for s in
                     integer_factorizations_to_n_parts(shape, 2)
-                ]
-            )
+                ][:-1]
+
+        for _ in shape_subspace.ranks:
+            choice_generators.append(gen)
         return choice_generators

     def initialize_choice_iterators(self):
diff --git a/tests/fastfusion/test_mapper.py b/tests/fastfusion/test_mapper.py
index 0ac7657..50ec139 100644
--- a/tests/fastfusion/test_mapper.py
+++ b/tests/fastfusion/test_mapper.py
@@ -3,8 +3,7 @@

 from bindings.looptree import LooptreeWorkload

-from pytimeloop.fastfusion.mapper.mapper import mapper
-from pytimeloop.fastfusion.mapper.shape_subspace import ShapeSubspace
+from pytimeloop.fastfusion.mapper.mapper2 import mapper, MacArrayConstraint
 from tests.load_config_mixin import LoadConfigMixin
 from tests.util import TEST_TMP_DIR

@@ -13,58 +12,30 @@ class TestMapper(LoadConfigMixin, unittest.TestCase):
     def test_mapper(self):
         config, spec = self.load_config([
-            'cascaded_mm.workload.yaml',
-            'three_level.arch.yaml'
+            'cascaded_mm_large.workload.yaml',
+            'four_level.arch.yaml'
         ])

+        mac_constraint = MacArrayConstraint(
+            64,
+            64,
+            {
+                'Fc1': 'Filter1',
+                'Fc2': 'Filter2'
+            },
+            {
+                'Fc1': 'M1',
+                'Fc2': 'M2'
+            },
+            {
+                'Fc1': 'C1',
+                'Fc2': 'C2'
+            }
+        )
+
         result = mapper(config,
+                        mac_constraint,
                         spec,
-                        TEST_TMP_DIR,
+                        tmp_path=TEST_TMP_DIR,
                         verbose_stream=sys.stdout)


-
-class TestShapeSubspace(LoadConfigMixin, unittest.TestCase):
-    def setUp(self) -> None:
-        config, spec = self.load_config([
-            'cascaded_mm.workload.yaml',
-            'three_level.arch.yaml'
-        ])
-
-        workload = LooptreeWorkload.parse_cfg(config.root['problem'])
-        NAME_OF_EINSUM_TO_MAP = 'Fc1'
-
-        einsum_name_to_id = workload.einsum_name_to_id()
-        id_of_einsum_to_eval = einsum_name_to_id[NAME_OF_EINSUM_TO_MAP]
-
-        ranks = workload.einsum_ospace_dimensions(id_of_einsum_to_eval)
-        einsum_shape = {
-            rank_id: workload.get_rank_shape(rank_id)[1]+1 for rank_id in ranks
-        }
-
-        shape_subspace = ShapeSubspace(einsum_shape, ranks)
-        self.subspace_it = iter(shape_subspace)
-
-    def test_iterate_all(self):
-        self.assertEqual(18, self.count_iterations(self.subspace_it))
-
-    def test_skip_first_iteration(self):
-        # Simulates first choice not being valid
-        first_choice = next(self.subspace_it)
-        self.subspace_it.skip_current_rank_iteration()
-        self.assertEqual(0, self.count_iterations(self.subspace_it))
-
-    def test_skip_second_rank_interation(self):
-        first_choice = next(self.subspace_it)
-        second_rank_val_in_first_choice = first_choice[0][-2]
-        for shape in self.subspace_it:
-            if shape[0][-2] != second_rank_val_in_first_choice:
-                break
-        self.subspace_it.skip_current_rank_iteration()
-        self.assertEqual(12, self.count_iterations(self.subspace_it))
-
-    @staticmethod
-    def count_iterations(it):
-        count = 0
-        for _ in it:
-            count += 1
-        return count
\ No newline at end of file
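
For orientation, here is a small usage sketch of the data structures this patch introduces. It assumes the patched pytimeloop package is importable; the rank, tensor, and Einsum names ('M', 'K', 'Fmap1', 'Fc1') are made-up placeholders, and the integer storage/compute targets follow the level numbering used in mapper2.py (0 unfused/DRAM-like, 1 fused/GLB-like, 2 PE, 3 MAC), which is an interpretation of the constants in the diff rather than something the patch states explicitly.

    from pytimeloop.fastfusion.mapper.mapper2 import LinearMapping

    mapping = LinearMapping()
    mapping.add_temporal('M')                # no tile_shape recorded, so it stays a free variable
    mapping.add_storage(1, {'Fmap1'})        # storage node at level 1
    mapping.add_sequential()
    mapping.add_temporal('K', tile_shape=1)  # fixed tile shape, not explored
    mapping.add_spatial('M', 1)
    mapping.add_compute('Fc1', 3)

    # The same rank collection explore_tile_shape performs before building a ShapeSubspace:
    free_ranks = [node['rank'] for node in mapping
                  if node['type'] in ('temporal', 'spatial') and 'tile_shape' not in node]
    print(free_ranks)  # expected: ['M']

explore_tile_shape then iterates a ShapeSubspace over exactly these free ranks, evaluates the compiled symbolic model for each candidate shape, and calls skip_current_rank_iteration() to prune whole subtrees whenever a level exceeds its capacity limit.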