From 89cac66129c3ee1f996ca6ebee7c25f894b92f18 Mon Sep 17 00:00:00 2001
From: Michael Gilbert
Date: Wed, 16 Oct 2024 08:30:21 -0400
Subject: [PATCH] Uneven support

---
 bindings/looptree/ir.cpp                      |  77 +++++++
 pytimeloop/fastfusion/fusionset.py            | 218 +-----------------
 .../mapper/level_mapper/exhaustive.py         | 141 +++++------
 .../fastfusion/mapper/level_mapper/helper.py  |  11 +
 .../mapper/level_mapper/top_level.py          | 157 ++++++-------
 pytimeloop/fastfusion/mapper/mapper.py        |  20 +-
 pytimeloop/fastfusion/mapper/stepped_model.py |  38 ++-
 pytimeloop/looptree/fastmodel/fastmodel.py    |   7 +-
 pytimeloop/looptree/latency/latency.py        |  61 +++++
 .../cascaded_mm_large.workload.yaml           |  36 +++
 tests/test_configs/four_level.arch.yaml       |  44 ++++
 tests/test_configs/tiled.arch.yaml            |  44 ++++
 12 files changed, 485 insertions(+), 369 deletions(-)
 create mode 100644 bindings/looptree/ir.cpp
 create mode 100644 pytimeloop/fastfusion/mapper/level_mapper/helper.py
 create mode 100644 tests/test_configs/cascaded_mm_large.workload.yaml
 create mode 100644 tests/test_configs/four_level.arch.yaml
 create mode 100644 tests/test_configs/tiled.arch.yaml

diff --git a/bindings/looptree/ir.cpp b/bindings/looptree/ir.cpp
new file mode 100644
index 0000000..ca46082
--- /dev/null
+++ b/bindings/looptree/ir.cpp
@@ -0,0 +1,77 @@
+#include "pytimeloop/bindings/looptree.h"
+
+#include
+
+#include
+#include
+#include
+
+#include
+
+
+#define FUSED_WORKLOAD_METHOD(python_name, cpp_name) \
+  def(#python_name, &problem::FusedWorkload::cpp_name)
+
+#define FUSED_WORKLOAD_ANALYZER_METHOD(python_name, cpp_name) \
+  def(#python_name, &problem::FusedWorkloadDependencyAnalyzer::cpp_name)
+
+namespace py = pybind11;
+
+#define DEFINE_REPR_VIA_STRINGSTREAM(class) \
+  def("__repr__", &print_via_stringstream<class>)
+
+#define DEFINE_PROPERTY(class, name) \
+  def_readwrite(#name, &analysis::class::name)
+
+
+template <typename T>
+std::string print_via_stringstream(const T& t)
+{
+  std::stringstream buf;
+  buf << t;
+  return buf.str();
+}
+
+
+namespace pytimeloop::looptree_bindings
+{
+
+void BindIr(py::module& m)
+{
+  py::class_<analysis::Temporal>(m, "Temporal")
+    .def(py::init<>())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::Temporal);
+
+  py::class_<analysis::Spatial>(m, "Spatial")
+    .def(py::init())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::Spatial);
+
+  py::class_<analysis::Sequential>(m, "Sequential")
+    .def(py::init<>())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::Sequential);
+
+  py::class_<analysis::PipelineTemporal>(m, "PipelineTemporal")
+    .def(py::init<>())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::PipelineTemporal);
+
+  py::class_<analysis::PipelineSpatial>(m, "PipelineSpatial")
+    .def(py::init<>())
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::PipelineSpatial);
+
+  py::class_<analysis::LogicalBuffer>(m, "LogicalBuffer")
+    .def(py::init<>())
+    .DEFINE_PROPERTY(LogicalBuffer, buffer_id)
+    .DEFINE_PROPERTY(LogicalBuffer, dspace_id)
+    .DEFINE_PROPERTY(LogicalBuffer, branch_leaf_id)
+    .DEFINE_REPR_VIA_STRINGSTREAM(analysis::LogicalBuffer);
+
+  py::class_<analysis::Occupancy>(m, "Occupancy")
+    .def(py::init<>())
+    .DEFINE_PROPERTY(Occupancy, dim_in_tags);
+
+  py::class_<analysis::Fill>(m, "Fill")
+    .def(py::init<>())
+    .DEFINE_PROPERTY(Fill, dim_in_tags);
+}
+
+}
\ No newline at end of file
diff --git a/pytimeloop/fastfusion/fusionset.py b/pytimeloop/fastfusion/fusionset.py
index f0204fc..f2a17fe 100644
--- a/pytimeloop/fastfusion/fusionset.py
+++ b/pytimeloop/fastfusion/fusionset.py
@@ -1,10 +1,10 @@
-from typing import Any, Generator
-from pareto import Pareto
-from compatibility import OpCompatibility
 from collections import defaultdict
-import unittest
 import itertools
-from util import fzs
+from typing import Any, Generator
+
+from .compatibility
import OpCompatibility +from .pareto import Pareto +from .util import fzs class FusionSet: @@ -202,210 +202,4 @@ def __lt__(self, other: "FusionSet") -> bool: return self.compatibility < other.compatibility def __repr__(self): - return f"FusionSet({self.compatibility})" - - -class TestFusionSet(unittest.TestCase): - def test_vertical_combine(self): - fs = [] - for i in range(2): - comp = OpCompatibility( - einsum_id=f"einsum1", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs(), - tensors=fzs(), - neighbors=fzs(), - ) - fs.append(FusionSet({comp}, Pareto(data={}))) - new_fs = FusionSet.vertical_combine(fs) - self.assertEqual(len(new_fs.compatibility), 1) - self.assertEqual(new_fs.payload.data, {}) - - def test_combine(self): - comp1 = OpCompatibility( - einsum_id=f"einsum1", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs("R"), - tensors=fzs("Q"), - neighbors=fzs("123"), - ) - comp2 = OpCompatibility( - einsum_id=f"einsum2", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs("S"), - tensors=fzs("V"), - neighbors=fzs("ABC"), - ) - fs1 = FusionSet({comp1}, Pareto(data={})) - fs2 = FusionSet({comp2}, Pareto(data={})) - new_fs = fs1.combine(fs2) - self.assertEqual(len(new_fs.compatibility), 2) - self.assertIn(comp1, new_fs.compatibility) - self.assertIn(comp2, new_fs.compatibility) - self.assertEqual(new_fs.payload.data, {}) - self.assertEqual(new_fs.tensors, {"Q", "V"}) - self.assertEqual(new_fs.ranks, {"R", "S"}) - - def test_compatibile_with(self): - for neighbors in fzs("ABC"), fzs(): - kwargs = dict( - fused_tensors=fzs("T1"), - fused_ranks=fzs(), - ranks=fzs("A"), - tensors=fzs(), - neighbors=neighbors, - ) - - comp1 = OpCompatibility(einsum_id="A", fused_loops=(("A", 1),), **kwargs) - comp2 = OpCompatibility(einsum_id="B", fused_loops=(("A", 2),), **kwargs) - - comp4 = OpCompatibility(einsum_id="C", fused_loops=(("A", 4),), **kwargs) - comp5 = OpCompatibility(einsum_id="C", fused_loops=(("A", 3),), **kwargs) - - fs1 = FusionSet({comp1, comp2}, Pareto(data={})) - fs2 = FusionSet({comp4}, Pareto(data={})) - self.assertEqual(fs1.compatible_with(fs2), True) - - fs2 = FusionSet({comp5}, Pareto(data={})) - # Not neighbors --> compatible becuase there's nothing overlapping to check - self.assertEqual(fs1.compatible_with(fs2), not neighbors) - - # Test: - # - Drop dead - # - Finding live neighbors - # - - def test_drop_dead(self): - comp1 = OpCompatibility( - einsum_id=f"einsum1", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs("R"), - tensors=fzs("Q"), - neighbors=fzs("123"), - ) - comp2 = OpCompatibility( - einsum_id=f"einsum2", - fused_tensors=fzs(), - fused_loops=(), - fused_ranks=fzs(), - ranks=fzs("S"), - tensors=fzs("V"), - neighbors=fzs("ABC"), - ) - fs = FusionSet({comp1, comp2}, Pareto(data={})) - fs.drop_dead({"einsum1"}) - self.assertEqual(len(fs.compatibility), 1) - self.assertIn(comp1, fs.compatibility) - self.assertEqual(fs.payload.data, {}) - fs.drop_dead(set()) - self.assertEqual(len(fs.compatibility), 0) - self.assertEqual(fs.payload.data, {}) - - def test_live_partition(self): - kwargs = dict( - fused_tensors=fzs("T1"), - ranks=fzs("A"), - tensors=fzs(), - fused_loops=(), - ) - - a = OpCompatibility( - einsum_id="A", fused_ranks=fzs("A"), neighbors=fzs("B"), **kwargs - ) - b = OpCompatibility( - einsum_id="B", fused_ranks=fzs("A"), neighbors=fzs("AC"), **kwargs - ) - c = OpCompatibility( - einsum_id="C", fused_ranks=fzs(), neighbors=fzs("BD"), **kwargs - ) - d = 
OpCompatibility( - einsum_id="D", fused_ranks=fzs("A"), neighbors=fzs("CE"), **kwargs - ) - e = OpCompatibility( - einsum_id="E", fused_ranks=fzs("A"), neighbors=fzs("DF"), **kwargs - ) - f = OpCompatibility( - einsum_id="F", fused_ranks=fzs("A"), neighbors=fzs("E"), **kwargs - ) - - for live, partition in [ - ("A", ("AB",)), - ("B", ("AB", "C")), - ("C", ("AB", "C", "DEF")), - ("D", ("C", "DEF")), - ("E", ("DEF",)), - ("F", ("DEF",)), - ("AF", ("AB", "DEF")), - ("ABF", ("AB", "C", "DEF")), - ]: - fs = FusionSet({a, b, c, d, e, f}, Pareto(data={})) - fs.drop_dead(set(live)) - partitions = OpCompatibility.get_tiled_partitions(fs.compatibility) - ids = tuple( - sorted("".join(sorted(p.einsum_id for p in p2)) for p2 in partitions) - ) - msg = f"Failed with {live} {partition}, got {ids}" - self.assertEqual(len(fs.compatibility), sum(len(l) for l in partition), msg) - self.assertEqual(ids, partition, msg) - - def test_bucketing(self): - tensor_choices = ["A", "B", "BC"] - rank_choices = ["MN", "NM"] - rank_size_choices = [1, 2] - has_other_einsum = [True, False] - - comps = [] - for t in tensor_choices: - for r in rank_choices: - for rs in rank_size_choices: - for other in has_other_einsum: - kwargs = dict( - fused_tensors=fzs(t), - fused_loops=tuple((x, rs) for x in r), - fused_ranks=fzs(r), - ranks=fzs("MN"), - tensors=fzs(t), - neighbors=fzs(), - ) - x = {OpCompatibility(einsum_id="einsum1", **kwargs)} - if other: - x.add(OpCompatibility(einsum_id="einsum2", **kwargs)) - comps.append(FusionSet(x, Pareto(data={}))) - - def check_bucket_sizes(bucketed, expected): - if expected: - self.assertEqual(len(bucketed), expected[0]) - for b in bucketed.values(): - check_bucket_sizes(b, expected[1:]) - - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1"}, {"A"}, {"M", "N"} - ) - check_bucket_sizes(fusion_sets, [2, 2, 2]) - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1", "einsum2"}, {"A"}, {"M", "N"} - ) - check_bucket_sizes(fusion_sets, [4, 2, 2]) - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1"}, {"A", "C"}, {"M"} - ) - check_bucket_sizes(fusion_sets, [3, 2, 2]) - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1"}, {"A", "C"}, set() - ) - check_bucket_sizes(fusion_sets, [3, 1, 1]) - fusion_sets = FusionSet.bucket_multi_level( - comps, {"einsum1", "einsum2"}, set(), set() - ) - check_bucket_sizes(fusion_sets, [2, 1, 1]) - - -if __name__ == "__main__": - unittest.main() + return f"FusionSet({self.compatibility})" \ No newline at end of file diff --git a/pytimeloop/fastfusion/mapper/level_mapper/exhaustive.py b/pytimeloop/fastfusion/mapper/level_mapper/exhaustive.py index 5335146..9d47fa1 100644 --- a/pytimeloop/fastfusion/mapper/level_mapper/exhaustive.py +++ b/pytimeloop/fastfusion/mapper/level_mapper/exhaustive.py @@ -5,6 +5,8 @@ from pytimeloop.fastfusion.mapper.shape_subspace import ShapeSubspace +from .helper import gather_relevant_boundary_idxs + class ExhaustiveLevelMapper: def __init__(self, @@ -15,19 +17,17 @@ def __init__(self, lower_mapper, partial_model, step_back_model, + analyzer, max_spatial=(1,), - max_capacity=None, - mapping_filter=None, - stats_filter=None): + max_capacity=None): self.hw_level = hw_level self.ranks = ranks self.tensors = tensors self.can_bypass = can_bypass self.lower_mapper = lower_mapper - self.mapping_filter = mapping_filter - self.stats_filter = stats_filter self.partial_model = partial_model self.step_back_model = step_back_model + self.analyzer = analyzer self.max_spatial = max_spatial 
self.max_capacity = max_capacity @@ -48,49 +48,83 @@ def run(self, rank_shapes, state): else: spatial_ranks_choices = product(spatial_ranks_choices, repeat=n_spatial) for temporal_ranks in temporal_ranks_choices: - for spatial_ranks in spatial_ranks_choices: - n_temporal_ranks = len(temporal_ranks) - n_spatial_ranks = tuple(len(ranks) for ranks in spatial_ranks) - all_ranks = ( - list(temporal_ranks) - + - sum((list(ranks) for ranks in spatial_ranks), start=[]) + all_tensor_choices = [] + for tensor_id in bypassing: + relevant_ranks = \ + self.analyzer.einsum_dims_relevant_to_tensor( + state.id_of_einsum_to_eval, + tensor_id + ) + all_tensor_choices.append( + (tensor_id, i) for i in + gather_relevant_boundary_idxs(temporal_ranks, + relevant_ranks) ) - tile_shape_subspace = ShapeSubspace(rank_shapes, all_ranks) - tile_shape_iterator = iter(tile_shape_subspace) - for tile_shape, leftover_rank_shapes in tile_shape_iterator: - temporal_tile_shape = tile_shape[:n_temporal_ranks] - start = n_temporal_ranks - spatial_tile_shapes = [] - for num_ranks in n_spatial_ranks: - spatial_tile_shapes.append(tile_shape[start:start+num_ranks]) - start += num_ranks + for retain_choices in product(*all_tensor_choices): + for spatial_ranks in spatial_ranks_choices: + n_temporal_ranks = len(temporal_ranks) + n_spatial_ranks = tuple(len(ranks) for ranks in spatial_ranks) + all_ranks = ( + list(temporal_ranks) + + + sum((list(ranks) for ranks in spatial_ranks), start=[]) + ) + tile_shape_subspace = ShapeSubspace(rank_shapes, all_ranks) + tile_shape_iterator = iter(tile_shape_subspace) + for tile_shape, leftover_rank_shapes in tile_shape_iterator: + temporal_tile_shape = tile_shape[:n_temporal_ranks] + start = n_temporal_ranks + spatial_tile_shapes = [] + for num_ranks in n_spatial_ranks: + spatial_tile_shapes.append(tile_shape[start:start+num_ranks]) + start += num_ranks + + temporal_loops = tuple(zip(temporal_ranks, temporal_tile_shape)) - temporal_loops = tuple(zip(temporal_ranks, temporal_tile_shape)) - if not self.check_mapping(temporal_loops, tile_shape, bypassing): - continue + spatial_loops = [ + tuple(zip(ranks, spatial_tile_shape)) + for ranks, spatial_tile_shape + in zip(spatial_ranks, spatial_tile_shapes) + ] - spatial_loops = [ - tuple(zip(ranks, spatial_tile_shape)) - for ranks, spatial_tile_shape - in zip(spatial_ranks, spatial_tile_shapes) - ] + new_state = deepcopy(state) + stats = self.partial_model(state=new_state, + temporal_loops=temporal_loops, + spatial_loops=spatial_loops, + retained_tensors=retain_choices) - new_state = deepcopy(state) - stats = self.partial_model(state=new_state, - temporal_loops=temporal_loops, - spatial_loops=spatial_loops, - retained_tensors=bypassing) + if self.lower_mapper is not None: + for stats in self.lower_mapper.run(leftover_rank_shapes, new_state): + invalid_spatial = any( + spatial_fanout > max_fanout + for spatial_fanout, max_fanout + in zip(stats.spatial[self.hw_level], self.max_spatial) + ) + if invalid_spatial: + break - if self.lower_mapper is not None: - for stats in self.lower_mapper.run(leftover_rank_shapes, new_state): + total_capacity = 0 + for (level, _), capacity in stats.capacity.items(): + if level == self.hw_level: + total_capacity += capacity + invalid_capacity = ( + self.max_capacity is not None + and + total_capacity > self.max_capacity + ) + if invalid_capacity: + tile_shape_iterator.skip_current_rank_iteration() + break + + yield stats + else: invalid_spatial = any( spatial_fanout > max_fanout for spatial_fanout, max_fanout in 
zip(stats.spatial[self.hw_level], self.max_spatial) ) if invalid_spatial: - break + continue total_capacity = 0 for (level, _), capacity in stats.capacity.items(): @@ -103,38 +137,5 @@ def run(self, rank_shapes, state): ) if invalid_capacity: tile_shape_iterator.skip_current_rank_iteration() - break yield stats - else: - invalid_spatial = any( - spatial_fanout > max_fanout - for spatial_fanout, max_fanout - in zip(stats.spatial[self.hw_level], self.max_spatial) - ) - if invalid_spatial: - continue - - total_capacity = 0 - for (level, _), capacity in stats.capacity.items(): - if level == self.hw_level: - total_capacity += capacity - invalid_capacity = ( - self.max_capacity is not None - and - total_capacity > self.max_capacity - ) - if invalid_capacity: - tile_shape_iterator.skip_current_rank_iteration() - - yield stats - - def check_mapping(self, temporal_loops, tile_shape, bypassing): - if self.mapping_filter is None: - return True - return self.mapping_filter(temporal_loops, tile_shape, bypassing) - - def check_stats(self, total_stats): - if self.stats_filter is None: - return True - return self.stats_filter(total_stats) diff --git a/pytimeloop/fastfusion/mapper/level_mapper/helper.py b/pytimeloop/fastfusion/mapper/level_mapper/helper.py new file mode 100644 index 0000000..a90a3b4 --- /dev/null +++ b/pytimeloop/fastfusion/mapper/level_mapper/helper.py @@ -0,0 +1,11 @@ +def gather_relevant_boundary_idxs(ranks, relevant_ranks): + idxs = [] + last_is_relevant = True + for i, r in enumerate(ranks): + is_relevant = r in relevant_ranks + if last_is_relevant and not is_relevant: + idxs.append(i) + last_is_relevant = is_relevant + if last_is_relevant: + idxs.append(len(ranks)) + return idxs \ No newline at end of file diff --git a/pytimeloop/fastfusion/mapper/level_mapper/top_level.py b/pytimeloop/fastfusion/mapper/level_mapper/top_level.py index 73a9114..ae1da45 100644 --- a/pytimeloop/fastfusion/mapper/level_mapper/top_level.py +++ b/pytimeloop/fastfusion/mapper/level_mapper/top_level.py @@ -12,6 +12,8 @@ from pytimeloop.fastfusion.compatibility import OpCompatibility from pytimeloop.fastfusion.mapper.stepped_model import SteppedModelState +from .helper import gather_relevant_boundary_idxs + class TopLevelMapper: def __init__(self, @@ -26,10 +28,9 @@ def __init__(self, partial_model, step_back_model, bits_per_word, + analyzer, max_spatial=(1,), - max_capacity=None, - mapping_filter=None, - stats_filter=None): + max_capacity=None): self.hw_level = hw_level self.ranks = frozenset(ranks) self.tensors = frozenset(tensors) @@ -37,11 +38,10 @@ def __init__(self, self.id_of_einsum_to_eval = id_of_einsum_to_eval self.neighbors = frozenset(neighbors) self.lower_mapper = lower_mapper - self.mapping_filter = mapping_filter - self.stats_filter = stats_filter self.model = model self.partial_model = partial_model self.step_back_model = step_back_model + self.analyzer = analyzer self.max_spatial = max_spatial self.max_capacity = max_capacity self.compatibility_to_df = defaultdict(lambda: defaultdict(lambda: list())) @@ -98,49 +98,87 @@ def run(self, rank_shapes): repeat=n_spatial) for temporal_ranks in temporal_ranks_choices: - for spatial_ranks in spatial_ranks_choices: - n_temporal_ranks = len(temporal_ranks) - n_spatial_ranks = tuple(len(ranks) for ranks in spatial_ranks) - all_ranks = ( - list(temporal_ranks) - + - sum((list(ranks) for ranks in spatial_ranks), start=[]) + all_tensor_choices = [] + for tensor_id in fused_tensors: + relevant_ranks = \ + self.analyzer.einsum_dims_relevant_to_tensor( + 
state.id_of_einsum_to_eval, + tensor_id + ) + all_tensor_choices.append( + (tensor_id, i) for i in + gather_relevant_boundary_idxs(temporal_ranks, + relevant_ranks) ) - tile_shape_subspace = ShapeSubspace(rank_shapes, all_ranks) - tile_shape_iterator = iter(tile_shape_subspace) - for tile_shape, leftover_rank_shapes in tile_shape_iterator: - temporal_tile_shape = tile_shape[:n_temporal_ranks] - start = n_temporal_ranks - spatial_tile_shapes = [] - for num_ranks in n_spatial_ranks: - spatial_tile_shapes.append(tile_shape[start:start+num_ranks]) - start += num_ranks - - temporal_loops = tuple(zip(temporal_ranks, temporal_tile_shape)) - if not self.check_mapping(temporal_loops, tile_shape, fused_tensors): - continue - - spatial_loops = [ - tuple(zip(ranks, spatial_tile_shape)) - for ranks, spatial_tile_shape - in zip(spatial_ranks, spatial_tile_shapes) - ] - - new_state = deepcopy(state) - stats = self.partial_model(state=new_state, - temporal_loops=temporal_loops, - spatial_loops=spatial_loops, - retained_tensors=fused_tensors) - - if self.lower_mapper is not None: - for stats in self.lower_mapper.run(leftover_rank_shapes, new_state): + for retain_choices in product(*all_tensor_choices): + for spatial_ranks in spatial_ranks_choices: + n_temporal_ranks = len(temporal_ranks) + n_spatial_ranks = tuple(len(ranks) for ranks in spatial_ranks) + all_ranks = ( + list(temporal_ranks) + + + sum((list(ranks) for ranks in spatial_ranks), start=[]) + ) + tile_shape_subspace = ShapeSubspace(rank_shapes, all_ranks) + tile_shape_iterator = iter(tile_shape_subspace) + for tile_shape, leftover_rank_shapes in tile_shape_iterator: + temporal_tile_shape = tile_shape[:n_temporal_ranks] + start = n_temporal_ranks + spatial_tile_shapes = [] + for num_ranks in n_spatial_ranks: + spatial_tile_shapes.append(tile_shape[start:start+num_ranks]) + start += num_ranks + + temporal_loops = tuple(zip(temporal_ranks, temporal_tile_shape)) + + spatial_loops = [ + tuple(zip(ranks, spatial_tile_shape)) + for ranks, spatial_tile_shape + in zip(spatial_ranks, spatial_tile_shapes) + ] + + new_state = deepcopy(state) + stats = self.partial_model(state=new_state, + temporal_loops=temporal_loops, + spatial_loops=spatial_loops, + retained_tensors=retain_choices) + + if self.lower_mapper is not None: + for stats in self.lower_mapper.run(leftover_rank_shapes, new_state): + invalid_spatial = any( + spatial_fanout > max_fanout + for spatial_fanout, max_fanout + in zip(stats.spatial[self.hw_level], self.max_spatial) + ) + if invalid_spatial: + break + + total_capacity = 0 + for (level, _), capacity in stats.capacity.items(): + if level == self.hw_level: + total_capacity += capacity + invalid_capacity = ( + self.max_capacity is not None + and + total_capacity > self.max_capacity + ) + if invalid_capacity: + tile_shape_iterator.skip_current_rank_iteration() + break + + self.store_evaluation_result( + fused_tensors, + temporal_loops, + stats + ) + else: invalid_spatial = any( spatial_fanout > max_fanout for spatial_fanout, max_fanout in zip(stats.spatial[self.hw_level], self.max_spatial) ) if invalid_spatial: - break + continue total_capacity = 0 for (level, _), capacity in stats.capacity.items(): @@ -153,46 +191,9 @@ def run(self, rank_shapes): ) if invalid_capacity: tile_shape_iterator.skip_current_rank_iteration() - break self.store_evaluation_result( fused_tensors, temporal_loops, stats ) - else: - invalid_spatial = any( - spatial_fanout > max_fanout - for spatial_fanout, max_fanout - in zip(stats.spatial[self.hw_level], self.max_spatial) - 
) - if invalid_spatial: - continue - - total_capacity = 0 - for (level, _), capacity in stats.capacity.items(): - if level == self.hw_level: - total_capacity += capacity - invalid_capacity = ( - self.max_capacity is not None - and - total_capacity > self.max_capacity - ) - if invalid_capacity: - tile_shape_iterator.skip_current_rank_iteration() - - self.store_evaluation_result( - fused_tensors, - temporal_loops, - stats - ) - - def check_mapping(self, temporal_loops, tile_shape, bypassing): - if self.mapping_filter is None: - return True - return self.mapping_filter(temporal_loops, tile_shape, bypassing) - - def check_stats(self, total_stats): - if self.stats_filter is None: - return True - return self.stats_filter(total_stats) diff --git a/pytimeloop/fastfusion/mapper/mapper.py b/pytimeloop/fastfusion/mapper/mapper.py index 0df1e8e..cc16e97 100644 --- a/pytimeloop/fastfusion/mapper/mapper.py +++ b/pytimeloop/fastfusion/mapper/mapper.py @@ -98,11 +98,11 @@ def final_model(level, state, temporal_loops, spatial_loops, retained_tensors): return model.run(state) def partial_model(level, state, temporal_loops, spatial_loops, retained_tensors): - model.add_storage(state, - level, - temporal_loops, - spatial_loops, - retained_tensors) + model.add_level_uneven(state, + level, + temporal_loops, + spatial_loops, + retained_tensors) return Stats() @@ -132,6 +132,7 @@ def partial_model(level, state, temporal_loops, spatial_loops, retained_tensors) max_capacity=level_max_cap, can_bypass=True, lower_mapper=cur_mapper, + analyzer=analyzer, partial_model=partial(partial_model, level=hw_level), step_back_model=step_back_model) @@ -150,6 +151,7 @@ def partial_model(level, state, temporal_loops, spatial_loops, retained_tensors) lower_mapper=cur_mapper, model=model, bits_per_word=8, + analyzer=analyzer, partial_model=partial(partial_model, level=0), step_back_model=step_back_model, max_spatial=max_spatial[hw_level], @@ -158,11 +160,19 @@ def partial_model(level, state, temporal_loops, spatial_loops, retained_tensors) cur_mapper.run(einsum_shape) result = cur_mapper.get_result() + before_pareto_size = sum(v.shape[0] for v in result.values()) result_dict = {} op_data = OpData(frozenset({id_of_einsum_to_eval}), frozenset(tensors)) + after_pareto_size = 0 for op_comp, data in result.items(): result_dict[op_comp] = Pareto({op_data: data}) + after_pareto_size += sum( + v.shape[0] for v in result_dict[op_comp].data.values() + ) + + print('mapspace size:', before_pareto_size) + print('mapspace after pareto size:', after_pareto_size) return result_dict diff --git a/pytimeloop/fastfusion/mapper/stepped_model.py b/pytimeloop/fastfusion/mapper/stepped_model.py index c3a6f87..d7e5859 100644 --- a/pytimeloop/fastfusion/mapper/stepped_model.py +++ b/pytimeloop/fastfusion/mapper/stepped_model.py @@ -1,3 +1,4 @@ +from collections import defaultdict from copy import deepcopy from dataclasses import dataclass from pathlib import Path @@ -78,7 +79,6 @@ def initialize(self, state, level, id_of_einsum_to_eval, retained_tensors): state.id_of_einsum_to_eval = id_of_einsum_to_eval - def add_storage(self, state, level, temporal_loops, spatial_loops, retained_tensors): self.add_temporal_and_spatial_loops(state, temporal_loops, spatial_loops) state.mapping_of_interest.append({ @@ -88,6 +88,30 @@ def add_storage(self, state, level, temporal_loops, spatial_loops, retained_tens for tensor_id in retained_tensors] }) + def add_level_uneven(self, state, level, temporal_loops, spatial_loops, retained_tensors): + idx_to_tensor = 
defaultdict(lambda: list()) + for tensor, idx in retained_tensors: + idx_to_tensor[idx].append(tensor) + + self.add_spatial_loops(state, spatial_loops) + + for i, l in enumerate(temporal_loops): + if i in idx_to_tensor: + state.mapping_of_interest.append({ + 'type': 'storage', + 'target': level, + 'dspace': [self.tensor_id_to_name[tensor_id] + for tensor_id in idx_to_tensor[i]] + }) + self.add_temporal_loops(state, [l]) + state.mapping_of_interest.append({ + 'type': 'storage', + 'target': level, + 'dspace': [self.tensor_id_to_name[tensor_id] + for tensor_id in idx_to_tensor[len(temporal_loops)]] + }) + + def add_compute(self, state, level, einsum_name, temporal_loops, spatial_loops): self.add_temporal_and_spatial_loops(state, temporal_loops, spatial_loops) state.mapping_of_interest.append({ @@ -107,6 +131,8 @@ def run(self, state): # model = LooptreeModelApp(config) self.eval_count += 1 + if self.eval_count % 1000 == 0: + print(self.eval_count // 1000) result = run_fastmodel({'nodes': state.mapping}, state.id_of_einsum_to_eval, self.workload, @@ -133,13 +159,19 @@ def run(self, state): return stats def add_temporal_and_spatial_loops(self, state, temporal_loops, spatial_loops): - for rank, shape in temporal_loops: + self.add_spatial_loops(state, spatial_loops) + self.add_temporal_loops(state, temporal_loops) + + def add_temporal_loops(self, state, loops): + for rank, shape in loops: state.mapping_of_interest.append({ 'type': 'temporal', 'rank': self.dimension_id_to_name[rank], 'tile_shape': shape }) - for spatial_idx, loops in enumerate(spatial_loops): + + def add_spatial_loops(self, state, loops): + for spatial_idx, loops in enumerate(loops): for rank, shape in loops: state.mapping_of_interest.append({ 'type': 'spatial', diff --git a/pytimeloop/looptree/fastmodel/fastmodel.py b/pytimeloop/looptree/fastmodel/fastmodel.py index d052721..30ccfeb 100644 --- a/pytimeloop/looptree/fastmodel/fastmodel.py +++ b/pytimeloop/looptree/fastmodel/fastmodel.py @@ -85,6 +85,10 @@ def run_fastmodel(mapping, tensor_size[tensor_id] //= factor else: potential_tensor_access_multiplier[tensor_id] *= factor + elif node['type'] == 'sequential': + for tensor_id in tensors: + actual_tensor_access_multiplier[tensor_id] = \ + potential_tensor_access_multiplier[tensor_id] elif node['type'] == 'spatial': rank_name = node['rank'] rank_id = rank_name_to_id[rank_name] @@ -126,7 +130,8 @@ def run_fastmodel(mapping, ) ) - fanout[target] = cur_fanout + if target not in fanout: + fanout[target] = cur_fanout cur_fanout = [1] elif node['type'] == 'compute': target = node['target'] diff --git a/pytimeloop/looptree/latency/latency.py b/pytimeloop/looptree/latency/latency.py index d6617c2..47945dc 100755 --- a/pytimeloop/looptree/latency/latency.py +++ b/pytimeloop/looptree/latency/latency.py @@ -1,15 +1,76 @@ +from collections import defaultdict + from pytimeloop.isl.singular import get_value_from_singular_qpolynomial from pytimeloop.looptree.latency.processors import LATENCY_PROCESSORS from bindings.looptree import SpatialTag +def get_latency(actions, mapping, temporal_steps, workload, arch): + comp_latency = compute_latency(mapping, temporal_steps, workload) + mem_latency = memory_latency(actions, arch) + return max(comp_latency, max(mem_latency.values())) + + def compute_latency(mapping, temporal_steps, workload): return get_value_from_singular_qpolynomial( _compute_latency(mapping, 0, temporal_steps, workload)[1] ).to_python() +def memory_latency(actions, arch): + component_to_read_writes = defaultdict(lambda: [None, None]) + 
for (component, action), count in actions.items(): + if action == 'read': + component_to_read_writes[component][0] = count + elif action == 'write': + component_to_read_writes[component][1] = count + + component_latency = {} + bandwidths = get_bandwidth(arch) + for component, (reads, writes) in component_to_read_writes.items(): + read_bw, write_bw, shared_bw = bandwidths[component] + # All shared bw for writing + write_latency = writes / (write_bw + shared_bw) + read_latency = reads / read_bw + if write_latency >= read_latency: + component_latency[component] = write_latency + continue + # All shared bw for reading + write_latency = writes / write_bw + read_latency = reads / (read_bw + shared_bw) + if read_latency >= write_latency: + component_latency[component] = read_latency + continue + # Shared bw shared for reading and writing + component_latency[component] = ( + (reads + writes) + / + (read_bw + write_bw + shared_bw) + ) + return component_latency + + +def get_bandwidth(arch): + component_bandwidths = {} + for node in arch['nodes']: + attributes = node.attributes + n_rd_ports = attributes.get('n_rd_ports', 0) + n_wr_ports = attributes.get('n_wr_ports', 0) + n_rdwr_ports = attributes.get('n_rdwr_ports', 0) + + width = attributes['width'] + datawidth = attributes['datawidth'] + width_in_words = width/datawidth + + component_bandwidths[node['name']] = [ + n_rd_ports*width_in_words, + n_wr_ports*width_in_words, + n_rdwr_ports*width_in_words + ] + return component_bandwidths + + def _compute_latency(mapping, top_idx: int, temporal_steps, workload): einsum_name_to_id = workload.einsum_name_to_id() diff --git a/tests/test_configs/cascaded_mm_large.workload.yaml b/tests/test_configs/cascaded_mm_large.workload.yaml new file mode 100644 index 0000000..11ee33f --- /dev/null +++ b/tests/test_configs/cascaded_mm_large.workload.yaml @@ -0,0 +1,36 @@ +problem: + - shape: + name: Fc1 + dimensions: [ P1, M1, C1 ] + data_spaces: + - name: Fmap1 + dimensions: [ Fmap1_C, Fmap1_P ] + projection: '[ C1, P1 ]' + - name: Filter1 + dimensions: [ Filter1_C, Filter1_M ] + projection: '[ C1, M1 ]' + - name: Fmap2 + dimensions: [ Fmap2_C, Fmap2_P ] + projection: '[ M1, P1 ]' + read_write: True + + instance: >- + 0 <= P1 < 1024 and 0 <= M1 < 1024 and 0 <= C1 < 1024 + + - shape: + name: Fc2 + dimensions: [ P2, M2, C2 ] + data_spaces: + - name: Fmap2 + dimensions: [ Fmap2_C, Fmap2_P ] + projection: '[ C2, P2 ]' + - name: Filter2 + dimensions: [ Filter2_C, Filter2_M ] + projection: '[ C2, M2 ]' + - name: Fmap3 + dimensions: [ Fmap3_C, Fmap3_P ] + projection: '[ M2, P2 ]' + read_write: True + + instance: >- + 0 <= P2 < 1024 and 0 <= M2 < 1024 and 0 <= C2 < 1024 diff --git a/tests/test_configs/four_level.arch.yaml b/tests/test_configs/four_level.arch.yaml new file mode 100644 index 0000000..ed786e0 --- /dev/null +++ b/tests/test_configs/four_level.arch.yaml @@ -0,0 +1,44 @@ +variables: + global_cycle_seconds: 1e-9 + technology: "45nm" + +architecture: + version: 0.4 + nodes: + - !Component + name: MainMemory + class: DRAM + attributes: {width: 256, block_size: 32, word_bits: 8, datawidth: 8} + required_actions: ['read', 'write'] + - !Component + name: GlobalBuffer + class: SRAM + attributes: + depth: 16384 + width: 512 + block_size: 32 + word_bits: 8 + datawidth: 8 + n_rdwr_ports: 2 + n_rd_ports: 0 + n_wr_ports: 0 + required_actions: ['read', 'write'] + - !Container + name: PE + spatial: {meshX: 4, meshY: 4} + - !Container + name: Compute + spatial: {meshX: 64, meshY: 64} + - !Component + name: Register + class: 
regfile + attributes: + depth: 2 + width: 8 + datawidth: 8 + required_actions: ['read', 'write'] + - !Component + name: MACC + class: intmac + attributes: {datawidth: 8, width: 8, cycle_time: 1e-9} + required_actions: ['compute'] diff --git a/tests/test_configs/tiled.arch.yaml b/tests/test_configs/tiled.arch.yaml new file mode 100644 index 0000000..30a0fcc --- /dev/null +++ b/tests/test_configs/tiled.arch.yaml @@ -0,0 +1,44 @@ +variables: + global_cycle_seconds: 1e-9 + technology: "45nm" + +architecture: + version: 0.4 + nodes: + - !Component + name: MainMemory + class: DRAM + attributes: {width: 256, block_size: 32, word_bits: 8, datawidth: 8} + required_actions: ['read', 'write'] + - !Container + name: PE + spatial: {meshX: 4, meshY: 4} + - !Component + name: GlobalBuffer + class: SRAM + attributes: + depth: 8192 + width: 64 + block_size: 32 + word_bits: 8 + datawidth: 8 + n_rdwr_ports: 2 + n_rd_ports: 0 + n_wr_ports: 0 + required_actions: ['read', 'write'] + - !Container + name: Compute + spatial: {meshX: 64, meshY: 64} + - !Component + name: Register + class: regfile + attributes: + depth: 2 + width: 8 + datawidth: 8 + required_actions: ['read', 'write'] + - !Component + name: MACC + class: intmac + attributes: {datawidth: 8, width: 8, cycle_time: 1e-9} + required_actions: ['compute']