From 89cac66129c3ee1f996ca6ebee7c25f894b92f18 Mon Sep 17 00:00:00 2001
From: Michael Gilbert
Date: Wed, 16 Oct 2024 08:30:21 -0400
Subject: [PATCH] Uneven support

---
 bindings/looptree/ir.cpp                      |  77 +++++++
 pytimeloop/fastfusion/fusionset.py            | 218 +-----------------
 .../mapper/level_mapper/exhaustive.py         | 141 +++++------
 .../fastfusion/mapper/level_mapper/helper.py  |  11 +
 .../mapper/level_mapper/top_level.py          | 157 ++++++-------
 pytimeloop/fastfusion/mapper/mapper.py        |  20 +-
 pytimeloop/fastfusion/mapper/stepped_model.py |  38 ++-
 pytimeloop/looptree/fastmodel/fastmodel.py    |   7 +-
 pytimeloop/looptree/latency/latency.py        |  61 +++++
 .../cascaded_mm_large.workload.yaml           |  36 +++
 tests/test_configs/four_level.arch.yaml       |  44 ++++
 tests/test_configs/tiled.arch.yaml            |  44 ++++
 12 files changed, 485 insertions(+), 369 deletions(-)
 create mode 100644 bindings/looptree/ir.cpp
 create mode 100644 pytimeloop/fastfusion/mapper/level_mapper/helper.py
 create mode 100644 tests/test_configs/cascaded_mm_large.workload.yaml
 create mode 100644 tests/test_configs/four_level.arch.yaml
 create mode 100644 tests/test_configs/tiled.arch.yaml

diff --git a/bindings/looptree/ir.cpp b/bindings/looptree/ir.cpp
new file mode 100644
index 0000000..ca46082
--- /dev/null
+++ b/bindings/looptree/ir.cpp
@@ -0,0 +1,77 @@
+#include "pytimeloop/bindings/looptree.h"
+
+#include
+
+#include
+#include
+#include
+
+#include
+
+
+#define FUSED_WORKLOAD_METHOD(python_name, cpp_name) \
+  def(#python_name, &problem::FusedWorkload::cpp_name)
+
+#define FUSED_WORKLOAD_ANALYZER_METHOD(python_name, cpp_name) \
+  def(#python_name, &problem::FusedWorkloadDependencyAnalyzer::cpp_name)
+
+namespace py = pybind11;
+
+#define DEFINE_REPR_VIA_STRINGSTREAM(class) \
+  def("__repr__", &print_via_stringstream<class>)
+
+#define DEFINE_PROPERTY(class, name) \
+  def_readwrite(#name, &analysis::class::name)
+
+
+template <typename T>
+std::string print_via_stringstream(const T& t)
+{
+  std::stringstream buf;
+  buf << t;
+  return buf.str();
+}
+
+
+namespace pytimeloop::looptree_bindings
+{
+
+void BindIr(py::module& m)
+{
+  py::class_<analysis::Temporal>(m, "Temporal")
+    .def(py::init<>())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::Temporal);
+
+  py::class_<analysis::Spatial>(m, "Spatial")
+    .def(py::init())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::Spatial);
+
+  py::class_<analysis::Sequential>(m, "Sequential")
+    .def(py::init<>())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::Sequential);
+
+  py::class_<analysis::PipelineTemporal>(m, "PipelineTemporal")
+    .def(py::init<>())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::PipelineTemporal);
+
+  py::class_<analysis::PipelineSpatial>(m, "PipelineSpatial")
+    .def(py::init<>())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::PipelineSpatial);
+
+  py::class_<analysis::LogicalBuffer>(m, "LogicalBuffer")
+    .def(py::init<>())
+    .DEFINE_PROPERTY(LogicalBuffer, buffer_id)
+    .DEFINE_PROPERTY(LogicalBuffer, dspace_id)
+    .DEFINE_PROPERTY(LogicalBuffer, branch_leaf_id)
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::LogicalBuffer);
+
+  py::class_<analysis::Occupancy>(m, "Occupancy")
+    .def(py::init<>())
+    .DEFINE_PROPERTY(Occupancy, dim_in_tags);
+
+  py::class_<analysis::Fill>(m, "Fill")
+    .def(py::init<>())
+    .DEFINE_PROPERTY(Fill, dim_in_tags);
+}
+
+}
\ No newline at end of file
diff --git a/pytimeloop/fastfusion/fusionset.py b/pytimeloop/fastfusion/fusionset.py
index f0204fc..f2a17fe 100644
--- a/pytimeloop/fastfusion/fusionset.py
+++ b/pytimeloop/fastfusion/fusionset.py
@@ -1,10 +1,10 @@
-from typing import Any, Generator
-from pareto import Pareto
-from compatibility import OpCompatibility
 from collections import defaultdict
-import unittest
 import itertools
-from util import fzs
+from typing import Any, Generator
+
+from .compatibility
import OpCompatibility +from .pareto import Pareto +from .util import fzs class FusionSet: @@ -202,210 +202,4 @@ def __lt__(self, other: "FusionSet") -> bool: return self.compatibility < other.compatibility def __repr__(self): - return f"FusionSet({self.compatibility})" - - -class TestFusionSet(unittest.TestCase): - def test_vertical_combine(self): - fs = [] - for i in range(2): - comp = OpCompatibility( - einsum_id=f"einsum1", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs(), - tensors=fzs(), - neighbors=fzs(), - ) - fs.append(FusionSet({comp}, Pareto(data={}))) - new_fs = FusionSet.vertical_combine(fs) - self.assertEqual(len(new_fs.compatibility), 1) - self.assertEqual(new_fs.payload.data, {}) - - def test_combine(self): - comp1 = OpCompatibility( - einsum_id=f"einsum1", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs("R"), - tensors=fzs("Q"), - neighbors=fzs("123"), - ) - comp2 = OpCompatibility( - einsum_id=f"einsum2", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs("S"), - tensors=fzs("V"), - neighbors=fzs("ABC"), - ) - fs1 = FusionSet({comp1}, Pareto(data={})) - fs2 = FusionSet({comp2}, Pareto(data={})) - new_fs = fs1.combine(fs2) - self.assertEqual(len(new_fs.compatibility), 2) - self.assertIn(comp1, new_fs.compatibility) - self.assertIn(comp2, new_fs.compatibility) - self.assertEqual(new_fs.payload.data, {}) - self.assertEqual(new_fs.tensors, {"Q", "V"}) - self.assertEqual(new_fs.ranks, {"R", "S"}) - - def test_compatibile_with(self): - for neighbors in fzs("ABC"), fzs(): - kwargs = dict( - fused_tensors=fzs("T1"), - fused_ranks=fzs(), - ranks=fzs("A"), - tensors=fzs(), - neighbors=neighbors, - ) - - comp1 = OpCompatibility(einsum_id="A", fused_loops=(("A", 1),), **kwargs) - comp2 = OpCompatibility(einsum_id="B", fused_loops=(("A", 2),), **kwargs) - - comp4 = OpCompatibility(einsum_id="C", fused_loops=(("A", 4),), **kwargs) - comp5 = OpCompatibility(einsum_id="C", fused_loops=(("A", 3),), **kwargs) - - fs1 = FusionSet({comp1, comp2}, Pareto(data={})) - fs2 = FusionSet({comp4}, Pareto(data={})) - self.assertEqual(fs1.compatible_with(fs2), True) - - fs2 = FusionSet({comp5}, Pareto(data={})) - # Not neighbors --> compatible becuase there's nothing overlapping to check - self.assertEqual(fs1.compatible_with(fs2), not neighbors) - - # Test: - # - Drop dead - # - Finding live neighbors - # - - def test_drop_dead(self): - comp1 = OpCompatibility( - einsum_id=f"einsum1", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs("R"), - tensors=fzs("Q"), - neighbors=fzs("123"), - ) - comp2 = OpCompatibility( - einsum_id=f"einsum2", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs("S"), - tensors=fzs("V"), - neighbors=fzs("ABC"), - ) - fs = FusionSet({comp1, comp2}, Pareto(data={})) - fs.drop_dead({"einsum1"}) - self.assertEqual(len(fs.compatibility), 1) - self.assertIn(comp1, fs.compatibility) - self.assertEqual(fs.payload.data, {}) - fs.drop_dead(set()) - self.assertEqual(len(fs.compatibility), 0) - self.assertEqual(fs.payload.data, {}) - - def test_live_partition(self): - kwargs = dict( - fused_tensors=fzs("T1"), - ranks=fzs("A"), - tensors=fzs(), - fused_loops=(), - ) - - a = OpCompatibility( - einsum_id="A", fused_ranks=fzs("A"), neighbors=fzs("B"), **kwargs - ) - b = OpCompatibility( - einsum_id="B", fused_ranks=fzs("A"), neighbors=fzs("AC"), **kwargs - ) - c = OpCompatibility( - einsum_id="C", fused_ranks=fzs(), neighbors=fzs("BD"), **kwargs - ) - d = 
OpCompatibility( - einsum_id="D", fused_ranks=fzs("A"), neighbors=fzs("CE"), **kwargs - ) - e = OpCompatibility( - einsum_id="E", fused_ranks=fzs("A"), neighbors=fzs("DF"), **kwargs - ) - f = OpCompatibility( - einsum_id="F", fused_ranks=fzs("A"), neighbors=fzs("E"), **kwargs - ) - - for live, partition in [ - ("A", ("AB",)), - ("B", ("AB", "C")), - ("C", ("AB", "C", "DEF")), - ("D", ("C", "DEF")), - ("E", ("DEF",)), - ("F", ("DEF",)), - ("AF", ("AB", "DEF")), - ("ABF", ("AB", "C", "DEF")), - ]: - fs = FusionSet({a, b, c, d, e, f}, Pareto(data={})) - fs.drop_dead(set(live)) - partitions = OpCompatibility.get_tiled_partitions(fs.compatibility) - ids = tuple( - sorted("".join(sorted(p.einsum_id for p in p2)) for p2 in partitions) - ) - msg = f"Failed with {live} {partition}, got {ids}" - self.assertEqual(len(fs.compatibility), sum(len(l) for l in partition), msg) - self.assertEqual(ids, partition, msg) - - def test_bucketing(self): - tensor_choices = ["A", "B", "BC"] - rank_choices = ["MN", "NM"] - rank_size_choices = [1, 2] - has_other_einsum = [True, False] - - comps = [] - for t in tensor_choices: - for r in rank_choices: - for rs in rank_size_choices: - for other in has_other_einsum: - kwargs = dict( - fused_tensors=fzs(t), - fused_loops=tuple((x, rs) for x in r), - fused_ranks=fzs(r), - ranks=fzs("MN"), - tensors=fzs(t), - neighbors=fzs(), - ) - x = {OpCompatibility(einsum_id="einsum1", **kwargs)} - if other: - x.add(OpCompatibility(einsum_id="einsum2", **kwargs)) - comps.append(FusionSet(x, Pareto(data={}))) - - def check_bucket_sizes(bucketed, expected): - if expected: - self.assertEqual(len(bucketed), expected[0]) - for b in bucketed.values(): - check_bucket_sizes(b, expected[1:]) - - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1"}, {"A"}, {"M", "N"} - ) - check_bucket_sizes(fusion_sets, [2, 2, 2]) - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1", "einsum2"}, {"A"}, {"M", "N"} - ) - check_bucket_sizes(fusion_sets, [4, 2, 2]) - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1"}, {"A", "C"}, {"M"} - ) - check_bucket_sizes(fusion_sets, [3, 2, 2]) - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1"}, {"A", "C"}, set() - ) - check_bucket_sizes(fusion_sets, [3, 1, 1]) - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1", "einsum2"}, set(), set() - ) - check_bucket_sizes(fusion_sets, [2, 1, 1]) - - -if __name__ == "__main__": - unittest.main() + return f"FusionSet({self.compatibility})" \ No newline at end of file diff --git a/pytimeloop/fastfusion/mapper/level_mapper/exhaustive.py b/pytimeloop/fastfusion/mapper/level_mapper/exhaustive.py index 5335146..9d47fa1 100644 --- a/pytimeloop/fastfusion/mapper/level_mapper/exhaustive.py +++ b/pytimeloop/fastfusion/mapper/level_mapper/exhaustive.py @@ -5,6 +5,8 @@ from pytimeloop.fastfusion.mapper.shape_subspace import ShapeSubspace +from .helper import gather_relevant_boundary_idxs + class ExhaustiveLevelMapper: def __init__(self, @@ -15,19 +17,17 @@ def __init__(self, lower_mapper, partial_model, step_back_model, + analyzer, max_spatial=(1,), - max_capacity=None, - mapping_filter=None, - stats_filter=None): + max_capacity=None): self.hw_level = hw_level self.ranks = ranks self.tensors = tensors self.can_bypass = can_bypass self.lower_mapper = lower_mapper - self.mapping_filter = mapping_filter - self.stats_filter = stats_filter self.partial_model = partial_model self.step_back_model = step_back_model + self.analyzer = analyzer self.max_spatial = max_spatial 
self.max_capacity = max_capacity @@ -48,49 +48,83 @@ def run(self, rank_shapes, state): else: spatial_ranks_choices = product(spatial_ranks_choices, repeat=n_spatial) for temporal_ranks in temporal_ranks_choices: - for spatial_ranks in spatial_ranks_choices: - n_temporal_ranks = len(temporal_ranks) - n_spatial_ranks = tuple(len(ranks) for ranks in spatial_ranks) - all_ranks = ( - list(temporal_ranks) - + - sum((list(ranks) for ranks in spatial_ranks), start=[]) + all_tensor_choices = [] + for tensor_id in bypassing: + relevant_ranks = \ + self.analyzer.einsum_dims_relevant_to_tensor( + state.id_of_einsum_to_eval, + tensor_id + ) + all_tensor_choices.append( + (tensor_id, i) for i in + gather_relevant_boundary_idxs(temporal_ranks, + relevant_ranks) ) - tile_shape_subspace = ShapeSubspace(rank_shapes, all_ranks) - tile_shape_iterator = iter(tile_shape_subspace) - for tile_shape, leftover_rank_shapes in tile_shape_iterator: - temporal_tile_shape = tile_shape[:n_temporal_ranks] - start = n_temporal_ranks - spatial_tile_shapes = [] - for num_ranks in n_spatial_ranks: - spatial_tile_shapes.append(tile_shape[start:start+num_ranks]) - start += num_ranks + for retain_choices in product(*all_tensor_choices): + for spatial_ranks in spatial_ranks_choices: + n_temporal_ranks = len(temporal_ranks) + n_spatial_ranks = tuple(len(ranks) for ranks in spatial_ranks) + all_ranks = ( + list(temporal_ranks) + + + sum((list(ranks) for ranks in spatial_ranks), start=[]) + ) + tile_shape_subspace = ShapeSubspace(rank_shapes, all_ranks) + tile_shape_iterator = iter(tile_shape_subspace) + for tile_shape, leftover_rank_shapes in tile_shape_iterator: + temporal_tile_shape = tile_shape[:n_temporal_ranks] + start = n_temporal_ranks + spatial_tile_shapes = [] + for num_ranks in n_spatial_ranks: + spatial_tile_shapes.append(tile_shape[start:start+num_ranks]) + start += num_ranks + + temporal_loops = tuple(zip(temporal_ranks, temporal_tile_shape)) - temporal_loops = tuple(zip(temporal_ranks, temporal_tile_shape)) - if not self.check_mapping(temporal_loops, tile_shape, bypassing): - continue + spatial_loops = [ + tuple(zip(ranks, spatial_tile_shape)) + for ranks, spatial_tile_shape + in zip(spatial_ranks, spatial_tile_shapes) + ] - spatial_loops = [ - tuple(zip(ranks, spatial_tile_shape)) - for ranks, spatial_tile_shape - in zip(spatial_ranks, spatial_tile_shapes) - ] + new_state = deepcopy(state) + stats = self.partial_model(state=new_state, + temporal_loops=temporal_loops, + spatial_loops=spatial_loops, + retained_tensors=retain_choices) - new_state = deepcopy(state) - stats = self.partial_model(state=new_state, - temporal_loops=temporal_loops, - spatial_loops=spatial_loops, - retained_tensors=bypassing) + if self.lower_mapper is not None: + for stats in self.lower_mapper.run(leftover_rank_shapes, new_state): + invalid_spatial = any( + spatial_fanout > max_fanout + for spatial_fanout, max_fanout + in zip(stats.spatial[self.hw_level], self.max_spatial) + ) + if invalid_spatial: + break - if self.lower_mapper is not None: - for stats in self.lower_mapper.run(leftover_rank_shapes, new_state): + total_capacity = 0 + for (level, _), capacity in stats.capacity.items(): + if level == self.hw_level: + total_capacity += capacity + invalid_capacity = ( + self.max_capacity is not None + and + total_capacity > self.max_capacity + ) + if invalid_capacity: + tile_shape_iterator.skip_current_rank_iteration() + break + + yield stats + else: invalid_spatial = any( spatial_fanout > max_fanout for spatial_fanout, max_fanout in 
zip(stats.spatial[self.hw_level], self.max_spatial) ) if invalid_spatial: - break + continue total_capacity = 0 for (level, _), capacity in stats.capacity.items(): @@ -103,38 +137,5 @@ def run(self, rank_shapes, state): ) if invalid_capacity: tile_shape_iterator.skip_current_rank_iteration() - break yield stats - else: - invalid_spatial = any( - spatial_fanout > max_fanout - for spatial_fanout, max_fanout - in zip(stats.spatial[self.hw_level], self.max_spatial) - ) - if invalid_spatial: - continue - - total_capacity = 0 - for (level, _), capacity in stats.capacity.items(): - if level == self.hw_level: - total_capacity += capacity - invalid_capacity = ( - self.max_capacity is not None - and - total_capacity > self.max_capacity - ) - if invalid_capacity: - tile_shape_iterator.skip_current_rank_iteration() - - yield stats - - def check_mapping(self, temporal_loops, tile_shape, bypassing): - if self.mapping_filter is None: - return True - return self.mapping_filter(temporal_loops, tile_shape, bypassing) - - def check_stats(self, total_stats): - if self.stats_filter is None: - return True - return self.stats_filter(total_stats) diff --git a/pytimeloop/fastfusion/mapper/level_mapper/helper.py b/pytimeloop/fastfusion/mapper/level_mapper/helper.py new file mode 100644 index 0000000..a90a3b4 --- /dev/null +++ b/pytimeloop/fastfusion/mapper/level_mapper/helper.py @@ -0,0 +1,11 @@ +def gather_relevant_boundary_idxs(ranks, relevant_ranks): + idxs = [] + last_is_relevant = True + for i, r in enumerate(ranks): + is_relevant = r in relevant_ranks + if last_is_relevant and not is_relevant: + idxs.append(i) + last_is_relevant = is_relevant + if last_is_relevant: + idxs.append(len(ranks)) + return idxs \ No newline at end of file diff --git a/pytimeloop/fastfusion/mapper/level_mapper/top_level.py b/pytimeloop/fastfusion/mapper/level_mapper/top_level.py index 73a9114..ae1da45 100644 --- a/pytimeloop/fastfusion/mapper/level_mapper/top_level.py +++ b/pytimeloop/fastfusion/mapper/level_mapper/top_level.py @@ -12,6 +12,8 @@ from pytimeloop.fastfusion.compatibility import OpCompatibility from pytimeloop.fastfusion.mapper.stepped_model import SteppedModelState +from .helper import gather_relevant_boundary_idxs + class TopLevelMapper: def __init__(self, @@ -26,10 +28,9 @@ def __init__(self, partial_model, step_back_model, bits_per_word, + analyzer, max_spatial=(1,), - max_capacity=None, - mapping_filter=None, - stats_filter=None): + max_capacity=None): self.hw_level = hw_level self.ranks = frozenset(ranks) self.tensors = frozenset(tensors) @@ -37,11 +38,10 @@ def __init__(self, self.id_of_einsum_to_eval = id_of_einsum_to_eval self.neighbors = frozenset(neighbors) self.lower_mapper = lower_mapper - self.mapping_filter = mapping_filter - self.stats_filter = stats_filter self.model = model self.partial_model = partial_model self.step_back_model = step_back_model + self.analyzer = analyzer self.max_spatial = max_spatial self.max_capacity = max_capacity self.compatibility_to_df = defaultdict(lambda: defaultdict(lambda: list())) @@ -98,49 +98,87 @@ def run(self, rank_shapes): repeat=n_spatial) for temporal_ranks in temporal_ranks_choices: - for spatial_ranks in spatial_ranks_choices: - n_temporal_ranks = len(temporal_ranks) - n_spatial_ranks = tuple(len(ranks) for ranks in spatial_ranks) - all_ranks = ( - list(temporal_ranks) - + - sum((list(ranks) for ranks in spatial_ranks), start=[]) + all_tensor_choices = [] + for tensor_id in fused_tensors: + relevant_ranks = \ + self.analyzer.einsum_dims_relevant_to_tensor( + 
state.id_of_einsum_to_eval, + tensor_id + ) + all_tensor_choices.append( + (tensor_id, i) for i in + gather_relevant_boundary_idxs(temporal_ranks, + relevant_ranks) ) - tile_shape_subspace = ShapeSubspace(rank_shapes, all_ranks) - tile_shape_iterator = iter(tile_shape_subspace) - for tile_shape, leftover_rank_shapes in tile_shape_iterator: - temporal_tile_shape = tile_shape[:n_temporal_ranks] - start = n_temporal_ranks - spatial_tile_shapes = [] - for num_ranks in n_spatial_ranks: - spatial_tile_shapes.append(tile_shape[start:start+num_ranks]) - start += num_ranks - - temporal_loops = tuple(zip(temporal_ranks, temporal_tile_shape)) - if not self.check_mapping(temporal_loops, tile_shape, fused_tensors): - continue - - spatial_loops = [ - tuple(zip(ranks, spatial_tile_shape)) - for ranks, spatial_tile_shape - in zip(spatial_ranks, spatial_tile_shapes) - ] - - new_state = deepcopy(state) - stats = self.partial_model(state=new_state, - temporal_loops=temporal_loops, - spatial_loops=spatial_loops, - retained_tensors=fused_tensors) - - if self.lower_mapper is not None: - for stats in self.lower_mapper.run(leftover_rank_shapes, new_state): + for retain_choices in product(*all_tensor_choices): + for spatial_ranks in spatial_ranks_choices: + n_temporal_ranks = len(temporal_ranks) + n_spatial_ranks = tuple(len(ranks) for ranks in spatial_ranks) + all_ranks = ( + list(temporal_ranks) + + + sum((list(ranks) for ranks in spatial_ranks), start=[]) + ) + tile_shape_subspace = ShapeSubspace(rank_shapes, all_ranks) + tile_shape_iterator = iter(tile_shape_subspace) + for tile_shape, leftover_rank_shapes in tile_shape_iterator: + temporal_tile_shape = tile_shape[:n_temporal_ranks] + start = n_temporal_ranks + spatial_tile_shapes = [] + for num_ranks in n_spatial_ranks: + spatial_tile_shapes.append(tile_shape[start:start+num_ranks]) + start += num_ranks + + temporal_loops = tuple(zip(temporal_ranks, temporal_tile_shape)) + + spatial_loops = [ + tuple(zip(ranks, spatial_tile_shape)) + for ranks, spatial_tile_shape + in zip(spatial_ranks, spatial_tile_shapes) + ] + + new_state = deepcopy(state) + stats = self.partial_model(state=new_state, + temporal_loops=temporal_loops, + spatial_loops=spatial_loops, + retained_tensors=retain_choices) + + if self.lower_mapper is not None: + for stats in self.lower_mapper.run(leftover_rank_shapes, new_state): + invalid_spatial = any( + spatial_fanout > max_fanout + for spatial_fanout, max_fanout + in zip(stats.spatial[self.hw_level], self.max_spatial) + ) + if invalid_spatial: + break + + total_capacity = 0 + for (level, _), capacity in stats.capacity.items(): + if level == self.hw_level: + total_capacity += capacity + invalid_capacity = ( + self.max_capacity is not None + and + total_capacity > self.max_capacity + ) + if invalid_capacity: + tile_shape_iterator.skip_current_rank_iteration() + break + + self.store_evaluation_result( + fused_tensors, + temporal_loops, + stats + ) + else: invalid_spatial = any( spatial_fanout > max_fanout for spatial_fanout, max_fanout in zip(stats.spatial[self.hw_level], self.max_spatial) ) if invalid_spatial: - break + continue total_capacity = 0 for (level, _), capacity in stats.capacity.items(): @@ -153,46 +191,9 @@ def run(self, rank_shapes): ) if invalid_capacity: tile_shape_iterator.skip_current_rank_iteration() - break self.store_evaluation_result( fused_tensors, temporal_loops, stats ) - else: - invalid_spatial = any( - spatial_fanout > max_fanout - for spatial_fanout, max_fanout - in zip(stats.spatial[self.hw_level], self.max_spatial) - 
) - if invalid_spatial: - continue - - total_capacity = 0 - for (level, _), capacity in stats.capacity.items(): - if level == self.hw_level: - total_capacity += capacity - invalid_capacity = ( - self.max_capacity is not None - and - total_capacity > self.max_capacity - ) - if invalid_capacity: - tile_shape_iterator.skip_current_rank_iteration() - - self.store_evaluation_result( - fused_tensors, - temporal_loops, - stats - ) - - def check_mapping(self, temporal_loops, tile_shape, bypassing): - if self.mapping_filter is None: - return True - return self.mapping_filter(temporal_loops, tile_shape, bypassing) - - def check_stats(self, total_stats): - if self.stats_filter is None: - return True - return self.stats_filter(total_stats) diff --git a/pytimeloop/fastfusion/mapper/mapper.py b/pytimeloop/fastfusion/mapper/mapper.py index 0df1e8e..cc16e97 100644 --- a/pytimeloop/fastfusion/mapper/mapper.py +++ b/pytimeloop/fastfusion/mapper/mapper.py @@ -98,11 +98,11 @@ def final_model(level, state, temporal_loops, spatial_loops, retained_tensors): return model.run(state) def partial_model(level, state, temporal_loops, spatial_loops, retained_tensors): - model.add_storage(state, - level, - temporal_loops, - spatial_loops, - retained_tensors) + model.add_level_uneven(state, + level, + temporal_loops, + spatial_loops, + retained_tensors) return Stats() @@ -132,6 +132,7 @@ def partial_model(level, state, temporal_loops, spatial_loops, retained_tensors) max_capacity=level_max_cap, can_bypass=True, lower_mapper=cur_mapper, + analyzer=analyzer, partial_model=partial(partial_model, level=hw_level), step_back_model=step_back_model) @@ -150,6 +151,7 @@ def partial_model(level, state, temporal_loops, spatial_loops, retained_tensors) lower_mapper=cur_mapper, model=model, bits_per_word=8, + analyzer=analyzer, partial_model=partial(partial_model, level=0), step_back_model=step_back_model, max_spatial=max_spatial[hw_level], @@ -158,11 +160,19 @@ def partial_model(level, state, temporal_loops, spatial_loops, retained_tensors) cur_mapper.run(einsum_shape) result = cur_mapper.get_result() + before_pareto_size = sum(v.shape[0] for v in result.values()) result_dict = {} op_data = OpData(frozenset({id_of_einsum_to_eval}), frozenset(tensors)) + after_pareto_size = 0 for op_comp, data in result.items(): result_dict[op_comp] = Pareto({op_data: data}) + after_pareto_size += sum( + v.shape[0] for v in result_dict[op_comp].data.values() + ) + + print('mapspace size:', before_pareto_size) + print('mapspace after pareto size:', after_pareto_size) return result_dict diff --git a/pytimeloop/fastfusion/mapper/stepped_model.py b/pytimeloop/fastfusion/mapper/stepped_model.py index c3a6f87..d7e5859 100644 --- a/pytimeloop/fastfusion/mapper/stepped_model.py +++ b/pytimeloop/fastfusion/mapper/stepped_model.py @@ -1,3 +1,4 @@ +from collections import defaultdict from copy import deepcopy from dataclasses import dataclass from pathlib import Path @@ -78,7 +79,6 @@ def initialize(self, state, level, id_of_einsum_to_eval, retained_tensors): state.id_of_einsum_to_eval = id_of_einsum_to_eval - def add_storage(self, state, level, temporal_loops, spatial_loops, retained_tensors): self.add_temporal_and_spatial_loops(state, temporal_loops, spatial_loops) state.mapping_of_interest.append({ @@ -88,6 +88,30 @@ def add_storage(self, state, level, temporal_loops, spatial_loops, retained_tens for tensor_id in retained_tensors] }) + def add_level_uneven(self, state, level, temporal_loops, spatial_loops, retained_tensors): + idx_to_tensor = 
defaultdict(lambda: list()) + for tensor, idx in retained_tensors: + idx_to_tensor[idx].append(tensor) + + self.add_spatial_loops(state, spatial_loops) + + for i, l in enumerate(temporal_loops): + if i in idx_to_tensor: + state.mapping_of_interest.append({ + 'type': 'storage', + 'target': level, + 'dspace': [self.tensor_id_to_name[tensor_id] + for tensor_id in idx_to_tensor[i]] + }) + self.add_temporal_loops(state, [l]) + state.mapping_of_interest.append({ + 'type': 'storage', + 'target': level, + 'dspace': [self.tensor_id_to_name[tensor_id] + for tensor_id in idx_to_tensor[len(temporal_loops)]] + }) + + def add_compute(self, state, level, einsum_name, temporal_loops, spatial_loops): self.add_temporal_and_spatial_loops(state, temporal_loops, spatial_loops) state.mapping_of_interest.append({ @@ -107,6 +131,8 @@ def run(self, state): # model = LooptreeModelApp(config) self.eval_count += 1 + if self.eval_count % 1000 == 0: + print(self.eval_count // 1000) result = run_fastmodel({'nodes': state.mapping}, state.id_of_einsum_to_eval, self.workload, @@ -133,13 +159,19 @@ def run(self, state): return stats def add_temporal_and_spatial_loops(self, state, temporal_loops, spatial_loops): - for rank, shape in temporal_loops: + self.add_spatial_loops(state, spatial_loops) + self.add_temporal_loops(state, temporal_loops) + + def add_temporal_loops(self, state, loops): + for rank, shape in loops: state.mapping_of_interest.append({ 'type': 'temporal', 'rank': self.dimension_id_to_name[rank], 'tile_shape': shape }) - for spatial_idx, loops in enumerate(spatial_loops): + + def add_spatial_loops(self, state, loops): + for spatial_idx, loops in enumerate(loops): for rank, shape in loops: state.mapping_of_interest.append({ 'type': 'spatial', diff --git a/pytimeloop/looptree/fastmodel/fastmodel.py b/pytimeloop/looptree/fastmodel/fastmodel.py index d052721..30ccfeb 100644 --- a/pytimeloop/looptree/fastmodel/fastmodel.py +++ b/pytimeloop/looptree/fastmodel/fastmodel.py @@ -85,6 +85,10 @@ def run_fastmodel(mapping, tensor_size[tensor_id] //= factor else: potential_tensor_access_multiplier[tensor_id] *= factor + elif node['type'] == 'sequential': + for tensor_id in tensors: + actual_tensor_access_multiplier[tensor_id] = \ + potential_tensor_access_multiplier[tensor_id] elif node['type'] == 'spatial': rank_name = node['rank'] rank_id = rank_name_to_id[rank_name] @@ -126,7 +130,8 @@ def run_fastmodel(mapping, ) ) - fanout[target] = cur_fanout + if target not in fanout: + fanout[target] = cur_fanout cur_fanout = [1] elif node['type'] == 'compute': target = node['target'] diff --git a/pytimeloop/looptree/latency/latency.py b/pytimeloop/looptree/latency/latency.py index d6617c2..47945dc 100755 --- a/pytimeloop/looptree/latency/latency.py +++ b/pytimeloop/looptree/latency/latency.py @@ -1,15 +1,76 @@ +from collections import defaultdict + from pytimeloop.isl.singular import get_value_from_singular_qpolynomial from pytimeloop.looptree.latency.processors import LATENCY_PROCESSORS from bindings.looptree import SpatialTag +def get_latency(actions, mapping, temporal_steps, workload, arch): + comp_latency = compute_latency(mapping, temporal_steps, workload) + mem_latency = memory_latency(actions, arch) + return max(comp_latency, max(mem_latency.values())) + + def compute_latency(mapping, temporal_steps, workload): return get_value_from_singular_qpolynomial( _compute_latency(mapping, 0, temporal_steps, workload)[1] ).to_python() +def memory_latency(actions, arch): + component_to_read_writes = defaultdict(lambda: [None, None]) + 
for (component, action), count in actions.items(): + if action == 'read': + component_to_read_writes[component][0] = count + elif action == 'write': + component_to_read_writes[component][1] = count + + component_latency = {} + bandwidths = get_bandwidth(arch) + for component, (reads, writes) in component_to_read_writes.items(): + read_bw, write_bw, shared_bw = bandwidths[component] + # All shared bw for writing + write_latency = writes / (write_bw + shared_bw) + read_latency = reads / read_bw + if write_latency >= read_latency: + component_latency[component] = write_latency + continue + # All shared bw for reading + write_latency = writes / write_bw + read_latency = reads / (read_bw + shared_bw) + if read_latency >= write_latency: + component_latency[component] = read_latency + continue + # Shared bw shared for reading and writing + component_latency[component] = ( + (reads + writes) + / + (read_bw + write_bw + shared_bw) + ) + return component_latency + + +def get_bandwidth(arch): + component_bandwidths = {} + for node in arch['nodes']: + attributes = node.attributes + n_rd_ports = attributes.get('n_rd_ports', 0) + n_wr_ports = attributes.get('n_wr_ports', 0) + n_rdwr_ports = attributes.get('n_rdwr_ports', 0) + + width = attributes['width'] + datawidth = attributes['datawidth'] + width_in_words = width/datawidth + + component_bandwidths[node['name']] = [ + n_rd_ports*width_in_words, + n_wr_ports*width_in_words, + n_rdwr_ports*width_in_words + ] + return component_bandwidths + + def _compute_latency(mapping, top_idx: int, temporal_steps, workload): einsum_name_to_id = workload.einsum_name_to_id() diff --git a/tests/test_configs/cascaded_mm_large.workload.yaml b/tests/test_configs/cascaded_mm_large.workload.yaml new file mode 100644 index 0000000..11ee33f --- /dev/null +++ b/tests/test_configs/cascaded_mm_large.workload.yaml @@ -0,0 +1,36 @@ +problem: + - shape: + name: Fc1 + dimensions: [ P1, M1, C1 ] + data_spaces: + - name: Fmap1 + dimensions: [ Fmap1_C, Fmap1_P ] + projection: '[ C1, P1 ]' + - name: Filter1 + dimensions: [ Filter1_C, Filter1_M ] + projection: '[ C1, M1 ]' + - name: Fmap2 + dimensions: [ Fmap2_C, Fmap2_P ] + projection: '[ M1, P1 ]' + read_write: True + + instance: >- + 0 <= P1 < 1024 and 0 <= M1 < 1024 and 0 <= C1 < 1024 + + - shape: + name: Fc2 + dimensions: [ P2, M2, C2 ] + data_spaces: + - name: Fmap2 + dimensions: [ Fmap2_C, Fmap2_P ] + projection: '[ C2, P2 ]' + - name: Filter2 + dimensions: [ Filter2_C, Filter2_M ] + projection: '[ C2, M2 ]' + - name: Fmap3 + dimensions: [ Fmap3_C, Fmap3_P ] + projection: '[ M2, P2 ]' + read_write: True + + instance: >- + 0 <= P2 < 1024 and 0 <= M2 < 1024 and 0 <= C2 < 1024 diff --git a/tests/test_configs/four_level.arch.yaml b/tests/test_configs/four_level.arch.yaml new file mode 100644 index 0000000..ed786e0 --- /dev/null +++ b/tests/test_configs/four_level.arch.yaml @@ -0,0 +1,44 @@ +variables: + global_cycle_seconds: 1e-9 + technology: "45nm" + +architecture: + version: 0.4 + nodes: + - !Component + name: MainMemory + class: DRAM + attributes: {width: 256, block_size: 32, word_bits: 8, datawidth: 8} + required_actions: ['read', 'write'] + - !Component + name: GlobalBuffer + class: SRAM + attributes: + depth: 16384 + width: 512 + block_size: 32 + word_bits: 8 + datawidth: 8 + n_rdwr_ports: 2 + n_rd_ports: 0 + n_wr_ports: 0 + required_actions: ['read', 'write'] + - !Container + name: PE + spatial: {meshX: 4, meshY: 4} + - !Container + name: Compute + spatial: {meshX: 64, meshY: 64} + - !Component + name: Register + class: 
regfile + attributes: + depth: 2 + width: 8 + datawidth: 8 + required_actions: ['read', 'write'] + - !Component + name: MACC + class: intmac + attributes: {datawidth: 8, width: 8, cycle_time: 1e-9} + required_actions: ['compute'] diff --git a/tests/test_configs/tiled.arch.yaml b/tests/test_configs/tiled.arch.yaml new file mode 100644 index 0000000..30a0fcc --- /dev/null +++ b/tests/test_configs/tiled.arch.yaml @@ -0,0 +1,44 @@ +variables: + global_cycle_seconds: 1e-9 + technology: "45nm" + +architecture: + version: 0.4 + nodes: + - !Component + name: MainMemory + class: DRAM + attributes: {width: 256, block_size: 32, word_bits: 8, datawidth: 8} + required_actions: ['read', 'write'] + - !Container + name: PE + spatial: {meshX: 4, meshY: 4} + - !Component + name: GlobalBuffer + class: SRAM + attributes: + depth: 8192 + width: 64 + block_size: 32 + word_bits: 8 + datawidth: 8 + n_rdwr_ports: 2 + n_rd_ports: 0 + n_wr_ports: 0 + required_actions: ['read', 'write'] + - !Container + name: Compute + spatial: {meshX: 64, meshY: 64} + - !Component + name: Register + class: regfile + attributes: + depth: 2 + width: 8 + datawidth: 8 + required_actions: ['read', 'write'] + - !Component + name: MACC + class: intmac + attributes: {datawidth: 8, width: 8, cycle_time: 1e-9} + required_actions: ['compute']