From 1cde00fca2f4b41e4608ae20800223d82f223bec Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Mon, 11 Nov 2024 07:58:44 -0500 Subject: [PATCH] Deduplicate similar Einsums --- pytimeloop/fastfusion/layerdeduplication.py | 115 +++++++++--------- pytimeloop/fastfusion/mapper/mapper2.py | 106 +++++++++++++++- .../fastfusion/mapper/per_einsum_mapper.py | 5 +- .../fastfusion/mapper/shape_subspace2.py | 96 +++++++++++++++ tests/fastfusion/test_layerdeduplication.py | 26 +++- .../cascaded_mm_multi_32.workload.yaml | 32 ++--- 6 files changed, 298 insertions(+), 82 deletions(-) create mode 100644 pytimeloop/fastfusion/mapper/shape_subspace2.py diff --git a/pytimeloop/fastfusion/layerdeduplication.py b/pytimeloop/fastfusion/layerdeduplication.py index 0fb62b0..91c855f 100644 --- a/pytimeloop/fastfusion/layerdeduplication.py +++ b/pytimeloop/fastfusion/layerdeduplication.py @@ -1,5 +1,5 @@ from collections import defaultdict -from itertools import permutations +from itertools import permutations, product from pytimeloop.looptree.mapping_utilities import get_intermediate_tensors @@ -26,69 +26,72 @@ def is_equivalent(einsum_id1, einsum_id2, workload, analyzer): if einsum2_output_tensor is None: einsum2_output_tensor = set() - einsum1_tensors = einsum1_input_tensors | einsum1_output_tensor - einsum2_tensors = einsum2_input_tensors | einsum2_output_tensor - intermediate_tensors = get_intermediate_tensors(workload) - tensor_properties = defaultdict(set) - for tensor in einsum1_input_tensors | einsum2_input_tensors: - tensor_properties[tensor].add('input') - for tensor in einsum1_output_tensor | einsum2_output_tensor: - tensor_properties[tensor].add('input') - for tensor in intermediate_tensors: - if tensor not in tensor_properties: - continue - tensor_properties[tensor].add('intermediate') - tensor_properties = { - tensor: frozenset(properties) - for tensor, properties in tensor_properties.items() - } - property_to_tensors = defaultdict(lambda: (set(), set())) - for tensor, property in tensor_properties: - tensor_sets = property_to_tensors[property] - if tensor in einsum1_tensors: - tensor_sets[0].add(tensor) - else: - tensor_sets[1].add(tensor) - - for tensor_sets in property_to_tensors.values(): - if len(tensor_sets[0]) != len(tensor_sets[1]): - return None, None - - - - # Check if we can rename einsum1 ranks to create einsum2 - for renamed_ranks in permutations(einsum2_ranks): - rank_renaming = { - r1: r2 for r1, r2 in zip(einsum1_ranks, renamed_ranks) + all_tensor_properties = [] + all_tensors = [ + (einsum1_input_tensors, einsum1_output_tensor), + (einsum2_input_tensors, einsum2_output_tensor) + ] + for input_tensors, output_tensors in all_tensors: + tensor_properties = defaultdict(set) + for tensor in input_tensors: + tensor_properties[tensor].add('input') + for tensor in output_tensors: + tensor_properties[tensor].add('output') + for tensor in tensor_properties: + if tensor in intermediate_tensors: + tensor_properties[tensor].add('intermediate') + tensor_properties = { + tensor: frozenset(properties) + for tensor, properties in tensor_properties.items() } - # for tensor_renaming in get_tensor_renamings(property_to_tensors): - for renamed_input_tensors in permutations(einsum2_input_tensors): - input_tensor_renaming = { - t1: t2 for t1, t2 - in zip(einsum1_input_tensors, renamed_input_tensors) + all_tensor_properties.append(tensor_properties) + + property_to_tensors = defaultdict(lambda: (set(), set())) + for i, tensor_properties in enumerate(all_tensor_properties): + for tensor, property in 
tensor_properties.items(): + tensor_sets = property_to_tensors[property] + tensor_sets[i].add(tensor) + + # Check if we can rename tensors in einsum1 to einsum2 + for tensor_renaming in tensor_renamings(property_to_tensors): + # Check if we can rename einsum1 ranks to create einsum2 + for renamed_ranks in permutations(einsum2_ranks): + rank_renaming = { + r1: r2 for r1, r2 in zip(einsum1_ranks, renamed_ranks) } - for renamed_output_tensors in permutations(einsum2_output_tensor): - output_tensor_renaming = { - t1: t2 for t1, t2 - in zip(einsum1_output_tensor, renamed_output_tensors) - } - tensor_renaming = input_tensor_renaming | output_tensor_renaming + if not _shape_is_equivalent(rank_renaming, workload): + continue - if not _shape_is_equivalent(rank_renaming, workload): - continue + if not _dependency_is_equivalent(einsum_id1, + einsum_id2, + rank_renaming, + tensor_renaming, + analyzer): + continue - if not _dependency_is_equivalent(einsum_id1, - einsum_id2, - rank_renaming, - tensor_renaming, - analyzer): - continue + return rank_renaming, tensor_renaming + return None, None - return rank_renaming, tensor_renaming - return None, None +def tensor_renamings(property_to_tensors): + for tensors_of_1, tensors_of_2 in property_to_tensors.values(): + if len(tensors_of_1) != len(tensors_of_2): + return + + all_tensors_of_1 = [ + t + for tensors_of_1, _ in property_to_tensors.values() + for t in tensors_of_1 + ] + permutations_of_tensor_2_by_property = [] + for _, tensors_of_2 in property_to_tensors.values(): + permutations_of_tensor_2_by_property.append(permutations(tensors_of_2)) + for permutation_of_2 in product(*permutations_of_tensor_2_by_property): + permutation_of_2 = tuple(t for tupl in permutation_of_2 for t in tupl) + renaming = dict(zip(all_tensors_of_1, permutation_of_2)) + yield renaming def _shape_is_equivalent(rank_renaming, workload): diff --git a/pytimeloop/fastfusion/mapper/mapper2.py b/pytimeloop/fastfusion/mapper/mapper2.py index d5cb462..583a74d 100644 --- a/pytimeloop/fastfusion/mapper/mapper2.py +++ b/pytimeloop/fastfusion/mapper/mapper2.py @@ -1,4 +1,5 @@ from collections import defaultdict +from copy import deepcopy import logging.handlers from pathlib import Path import logging @@ -17,6 +18,7 @@ from pytimeloop.fastfusion.layerdeduplication import is_equivalent from pytimeloop.fastfusion.mapper.logging import make_queue_and_listener from pytimeloop.fastfusion.mapper.per_einsum_mapper import get_top_loop_jobs, mapper_place_fusion_level +from pytimeloop.fastfusion.sim import Tiling, Loop, TensorStorage from pytimeloop.timeloopfe.v4 import Ert from pytimeloop.timeloopfe.common.backend_calls import call_accelergy_verbose @@ -37,6 +39,8 @@ def mapper( log_queue, log_queue_listener = make_queue_and_listener() workload = LooptreeWorkload.parse_cfg(config.root["problem"]) + analyzer = LooptreeWorkloadDependencyAnalyzer(workload) + equivalent_groups = EquivalentGroups.from_workload(workload, analyzer) einsum_name_to_id = workload.einsum_name_to_id() @@ -47,8 +51,15 @@ def mapper( ert = Ert(ert_dict["ERT"]) energy_dict = ert.to_dict() + grouped_similar_einsums = convert_rank_to_group_renaming( + detect_similar_einsums(workload, analyzer), + equivalent_groups + ) + logger.info(f"Found {len(grouped_similar_einsums)} unique Einsums\n" + + f"\tConverter: {grouped_similar_einsums}") + args = get_top_loop_jobs( - einsum_name_to_id=einsum_name_to_id, + einsums_to_explore=list(grouped_similar_einsums.keys()), config=config, pe_array_constraint=pe_array_constraint, 
mac_array_constraint=mac_array_constraint, @@ -61,11 +72,11 @@ def mapper( ) print(f'Number of jobs: {len(args)}') - - logger.debug("Starting workers") + n_workers = 128 + logger.debug(f"Starting {n_workers} workers") log_queue_listener.start() - result = Parallel(n_jobs=128)( + result = Parallel(n_jobs=n_workers)( delayed(mapper_place_fusion_level)(**a) for a in args ) data = defaultdict(dict) @@ -78,5 +89,92 @@ def mapper( log_queue_listener.stop() logger.info(f"Mapper finished for {spec}") + + generated_data = {} + logger.info(f"Generating data for non-unique Einsums") + for ref_einsum, others in grouped_similar_einsums.items(): + for other_einsum, (rank_renaming, tensor_renaming) in others.items(): + logger.info(f"Generating data for {other_einsum}. " + + f"Rank renaming={rank_renaming}. " + + f"Tensor renaming={tensor_renaming}") + generated_data[other_einsum] = generate_data(data[ref_einsum], + rank_renaming, + tensor_renaming) + + for einsum, mapping in generated_data.items(): + data[einsum] = mapping + + logger.info(f"Final set of Einsums: {set(data.keys())}") + + # data has to come out in sorted Einsum-id order + data = {k: v for k, v in sorted(data.items(), key=lambda item: item[0])} + return data + +def generate_data(data, rank_renaming, tensor_renaming): + return { + _convert_tiling(tiling, rank_renaming, tensor_renaming) + : + _convert_stats(stats, rank_renaming, tensor_renaming) + for tiling, stats in data.items() + } + + +def _convert_tiling(tiling: Tiling, rank_renaming, tensor_renaming): + return Tiling( + loops=tuple(Loop(rank_renaming[l.rank_id], l.bound, l.is_spatial) + for l in tiling.loops), + tensors=frozenset(TensorStorage(tensor_renaming[ts.tensor_id], + ts.backer_id, + ts.above_loop_index, + ts.tile_size) + for ts in tiling.tensors) + ) + + +def _convert_stats(stats, rank_renaming, tensor_renaming): + return deepcopy(stats) + + +def detect_similar_einsums(workload, analyzer, return_all_as_unique=False): + if return_all_as_unique: + return {ref: {} for ref in workload.einsum_id_to_name()} + + ref_to_other_einsums = {} + for einsum in workload.einsum_id_to_name(): + found = False + for ref_einsum in ref_to_other_einsums: + rank_renaming, tensor_renaming = is_equivalent(ref_einsum, + einsum, + workload, + analyzer) + if rank_renaming is not None: + ref_to_other_einsums[ref_einsum][einsum] = (rank_renaming, + tensor_renaming) + found = True + break + if not found: + ref_to_other_einsums[einsum] = {} + return ref_to_other_einsums + + +def convert_rank_to_group_renaming(ref_to_other_einsums, equiv_ranks): + return { + ref: { + other: (_convert_rank_renaming(rank_renaming, equiv_ranks), + tensor_renaming) + for other, (rank_renaming, tensor_renaming) in others.items() + } + for ref, others in ref_to_other_einsums.items() + } + + +def _convert_rank_renaming(rank_renaming, equiv_ranks): + # The Tiling class uses string ids + return { + str(equiv_ranks.rank_to_group_id[r1]) + : + str(equiv_ranks.rank_to_group_id[r2]) + for r1, r2 in rank_renaming.items() + } diff --git a/pytimeloop/fastfusion/mapper/per_einsum_mapper.py b/pytimeloop/fastfusion/mapper/per_einsum_mapper.py index d91a42c..a758d69 100644 --- a/pytimeloop/fastfusion/mapper/per_einsum_mapper.py +++ b/pytimeloop/fastfusion/mapper/per_einsum_mapper.py @@ -83,6 +83,7 @@ def add_storage(self, target, dspaces, idx=None): else: self.mapping.insert(idx, node) + @log_worker(f"{__name__}:_mapper_place_fusion_level") def mapper_place_fusion_level( config, @@ -216,13 +217,13 @@ def get_top_loop_jobs( spec, 
explore_glb_uneven,
     explore_pe_uneven,
-    einsum_name_to_id,
+    einsums_to_explore,
     energy_dict,
     log_queue=None,
     verbose_stream=None,
 ):
     args = []
-    for einsum_id in einsum_name_to_id.values():
+    for einsum_id in einsums_to_explore:
         # if log_queue is not None:
         #     log_queue.info(f"[{einsum_id}] Exploring mapspace of Einsum {einsum_id}")
         logfunc = lambda msg: None # log_queue.debug(f"[{einsum_id}] " + msg)
diff --git a/pytimeloop/fastfusion/mapper/shape_subspace2.py b/pytimeloop/fastfusion/mapper/shape_subspace2.py
new file mode 100644
index 0000000..3480fd0
--- /dev/null
+++ b/pytimeloop/fastfusion/mapper/shape_subspace2.py
@@ -0,0 +1,96 @@
+"""
+General idea:
+- Often, higher hardware utilization leads to better metrics.
+- When that is not the case, the cause is that the hardware resource is
+  shared, and higher utilization by one user trades off against
+  utilization by another.
+
+In terms of tile shape, the utilization of interest is buffer
+utilization: a smaller loop factor gives a larger tile shape for
+temporal loops, and thus higher buffer utilization.
+Spatial loops are more complicated because a larger tile shape means
+lower parallel hardware utilization.
+
+To keep the shape iterator generic, we allow tagging a particular
+loop with hints:
+- Maximize tile shape
+- Minimize tile shape
+- Explore all
+The maximize/minimize tile shape tags cause the iterator to attempt
+to quickly find the largest/smallest *valid* tile shape.
+"""
+from collections.abc import Callable
+from enum import Enum
+
+from .shape_subspace import ShapeSubspace
+
+
+class IteratorHint(Enum):
+    MAXIMIZE = 0
+    MINIMIZE = 1
+    EXPLORE = 2
+
+
+class FastShapeSubspaceIterator:
+    def __init__(self,
+                 shape_subspace: ShapeSubspace,
+                 hints: list[IteratorHint]):
+        self.shape_subspace = shape_subspace
+        self.hints = hints
+
+    def explore_idx(self, idx: int, min_val: int, max_val: int,
+                    evaluator: "Callable[[int], EvaluationResult]"):
+        # Explore the tile shape of loop `idx` within [min_val, max_val]
+        # according to the hint attached to that loop.
+        hint = self.hints[idx]
+        if hint == IteratorHint.MAXIMIZE:
+            # Quickly find the largest valid tile shape.
+            return binary_search(min_val, max_val, evaluator,
+                                 search_max=True)
+        elif hint == IteratorHint.MINIMIZE:
+            # Quickly find the smallest valid tile shape.
+            return binary_search(min_val, max_val, evaluator,
+                                 search_max=False)
+        elif hint == IteratorHint.EXPLORE:
+            # Exhaustively enumerate all valid tile shapes.
+            return [v for v in range(min_val, max_val + 1)
+                    if evaluator(v) == EvaluationResult.VALID]
+        else:
+            raise ValueError(f"Unknown hint {hint}")
+
+
+class EvaluationResult(Enum):
+    TOO_SMALL = 0
+    VALID = 1
+    TOO_LARGE = 2
+
+
+def binary_search(min: int,
+                  max: int,
+                  evaluate: Callable[[int], EvaluationResult],
+                  search_max: bool):
+    if min > max:
+        raise ValueError("min must be less than or equal to max")
+
+    while min < max - 1:
+        cur = (min + max) // 2
+        cur_result = evaluate(cur)
+        if cur_result == EvaluationResult.TOO_LARGE:
+            max = cur - 1
+        elif cur_result == EvaluationResult.TOO_SMALL:
+            min = cur + 1
+        else:
+            if search_max:
+                min = cur
+            else:
+                max = cur
+
+    # The range may collapse to a single point; deduplicate candidates.
+    if min < max:
+        evaluate_order = [max, min] if search_max else [min, max]
+    else:
+        evaluate_order = [min]
+    for cur in evaluate_order:
+        if evaluate(cur) == EvaluationResult.VALID:
+            return cur
+    return None
diff --git a/tests/fastfusion/test_layerdeduplication.py b/tests/fastfusion/test_layerdeduplication.py
index 667406f..f794012 100644
--- a/tests/fastfusion/test_layerdeduplication.py
+++ b/tests/fastfusion/test_layerdeduplication.py
@@ -7,7 +7,7 @@
 
 
 class TestLayerDeduplication(LoadConfigMixin, unittest.TestCase):
-    def test_is_equivalent_mismatch(self):
+    def test_is_equivalent_mismatch_from_shape(self):
         config, spec = self.load_config([
             'four_level.arch.yaml',
             'cascaded_mm.workload.yaml'
@@ -21,7 +21,7 @@
         self.assertIs(rank_renaming, None)
self.assertIs(tensor_renaming, None) - def test_is_equivalent_match(self): + def test_is_equivalent_mismatch_from_tensor_roles(self): config, spec = self.load_config([ 'four_level.arch.yaml', 'cascaded_mm_32.workload.yaml' @@ -32,5 +32,23 @@ def test_is_equivalent_match(self): rank_renaming, tensor_renaming = \ is_equivalent(0, 1, workload, analyzer) - self.assertEqual(rank_renaming, {0: 9, 1: 10, 2: 11}) - self.assertEqual(tensor_renaming, {0: 2, 1: 3, 2: 4}) \ No newline at end of file + self.assertEqual(rank_renaming, None) + self.assertEqual(tensor_renaming, None) + + def test_is_equivalent_matches(self): + config, spec = self.load_config([ + 'four_level.arch.yaml', + 'cascaded_mm_multi_32.workload.yaml' + ]) + workload = LooptreeWorkload.parse_cfg(config.root['problem']) + analyzer = LooptreeWorkloadDependencyAnalyzer(workload) + + rank_renaming, tensor_renaming = \ + is_equivalent(0, 1, workload, analyzer) + self.assertEqual(rank_renaming, None) + self.assertEqual(tensor_renaming, None) + + rank_renaming, tensor_renaming = \ + is_equivalent(1, 2, workload, analyzer) + self.assertEqual(rank_renaming, {9: 16, 10: 17, 11: 18}) + self.assertEqual(tensor_renaming, {2: 4, 3: 5, 4: 6}) \ No newline at end of file diff --git a/tests/test_configs/cascaded_mm_multi_32.workload.yaml b/tests/test_configs/cascaded_mm_multi_32.workload.yaml index fe5ebac..848c8dd 100644 --- a/tests/test_configs/cascaded_mm_multi_32.workload.yaml +++ b/tests/test_configs/cascaded_mm_multi_32.workload.yaml @@ -6,7 +6,7 @@ problem: - {name: Fmap1, dimensions: [ Fmap1_C, Fmap1_P ], projection: '[ C1, P1 ]'} - {name: Filter1, dimensions: [ Filter1_C, Filter1_M ], projection: '[ C1, M1 ]'} - {name: Fmap2, dimensions: [ Fmap2_C, Fmap2_P ], projection: '[ M1, P1 ]', read_write: True} - instance: 0 <= P1 < 32 and 0 <= M1 < 32 and 0 <= C1 < 32 + instance: 0 <= P1 < 2 and 0 <= M1 < 2 and 0 <= C1 < 2 - shape: name: Fc2 @@ -15,7 +15,7 @@ problem: - {name: Fmap2, dimensions: [ Fmap2_C, Fmap2_P ], projection: '[ C2, P2 ]'} - {name: Filter2, dimensions: [ Filter2_C, Filter2_M ], projection: '[ C2, M2 ]'} - {name: Fmap3, dimensions: [ Fmap3_C, Fmap3_P ], projection: '[ M2, P2 ]', read_write: True} - instance: 0 <= P2 < 32 and 0 <= M2 < 32 and 0 <= C2 < 32 + instance: 0 <= P2 < 2 and 0 <= M2 < 2 and 0 <= C2 < 2 - shape: name: Fc3 @@ -24,7 +24,7 @@ problem: - {name: Fmap3, dimensions: [ Fmap3_C, Fmap3_P ], projection: '[ C3, P3 ]'} - {name: Filter3, dimensions: [ Filter3_C, Filter3_M ], projection: '[ C3, M3 ]'} - {name: Fmap4, dimensions: [ Fmap4_C, Fmap4_P ], projection: '[ M3, P3 ]', read_write: True} - instance: 0 <= P3 < 32 and 0 <= M3 < 32 and 0 <= C3 < 32 + instance: 0 <= P3 < 2 and 0 <= M3 < 2 and 0 <= C3 < 2 - shape: name: Fc4 @@ -33,7 +33,7 @@ problem: - {name: Fmap4, dimensions: [ Fmap4_C, Fmap4_P ], projection: '[ C4, P4 ]'} - {name: Filter4, dimensions: [ Filter4_C, Filter4_M ], projection: '[ C4, M4 ]'} - {name: Fmap5, dimensions: [ Fmap5_C, Fmap5_P ], projection: '[ M4, P4 ]', read_write: True} - instance: 0 <= P4 < 32 and 0 <= M4 < 32 and 0 <= C4 < 32 + instance: 0 <= P4 < 2 and 0 <= M4 < 2 and 0 <= C4 < 2 - shape: name: Fc5 @@ -42,7 +42,7 @@ problem: - {name: Fmap5, dimensions: [ Fmap5_C, Fmap5_P ], projection: '[ C5, P5 ]'} - {name: Filter5, dimensions: [ Filter5_C, Filter5_M ], projection: '[ C5, M5 ]'} - {name: Fmap6, dimensions: [ Fmap6_C, Fmap6_P ], projection: '[ M5, P5 ]', read_write: True} - instance: 0 <= P5 < 32 and 0 <= M5 < 32 and 0 <= C5 < 32 + instance: 0 <= P5 < 2 and 0 <= M5 < 2 and 0 <= C5 < 2 - 
shape: name: Fc6 @@ -51,7 +51,7 @@ problem: - {name: Fmap6, dimensions: [ Fmap6_C, Fmap6_P ], projection: '[ C6, P6 ]'} - {name: Filter6, dimensions: [ Filter6_C, Filter6_M ], projection: '[ C6, M6 ]'} - {name: Fmap7, dimensions: [ Fmap7_C, Fmap7_P ], projection: '[ M6, P6 ]', read_write: True} - instance: 0 <= P6 < 32 and 0 <= M6 < 32 and 0 <= C6 < 32 + instance: 0 <= P6 < 2 and 0 <= M6 < 2 and 0 <= C6 < 2 - shape: name: Fc7 @@ -60,7 +60,7 @@ problem: - {name: Fmap7, dimensions: [ Fmap7_C, Fmap7_P ], projection: '[ C7, P7 ]'} - {name: Filter7, dimensions: [ Filter7_C, Filter7_M ], projection: '[ C7, M7 ]'} - {name: Fmap8, dimensions: [ Fmap8_C, Fmap8_P ], projection: '[ M7, P7 ]', read_write: True} - instance: 0 <= P7 < 32 and 0 <= M7 < 32 and 0 <= C7 < 32 + instance: 0 <= P7 < 2 and 0 <= M7 < 2 and 0 <= C7 < 2 - shape: name: Fc8 @@ -69,7 +69,7 @@ problem: - {name: Fmap8, dimensions: [ Fmap8_C, Fmap8_P ], projection: '[ C8, P8 ]'} - {name: Filter8, dimensions: [ Filter8_C, Filter8_M ], projection: '[ C8, M8 ]'} - {name: Fmap9, dimensions: [ Fmap9_C, Fmap9_P ], projection: '[ M8, P8 ]', read_write: True} - instance: 0 <= P8 < 32 and 0 <= M8 < 32 and 0 <= C8 < 32 + instance: 0 <= P8 < 2 and 0 <= M8 < 2 and 0 <= C8 < 2 # - shape: # name: Fc9 @@ -78,7 +78,7 @@ problem: # - {name: Fmap9, dimensions: [ Fmap9_C, Fmap9_P ], projection: '[ C9, P9 ]'} # - {name: Filter9, dimensions: [ Filter9_C, Filter9_M ], projection: '[ C9, M9 ]'} # - {name: Fmap10, dimensions: [ Fmap10_C, Fmap10_P ], projection: '[ M9, P9 ]', read_write: True} - # instance: 0 <= P9 < 32 and 0 <= M9 < 32 and 0 <= C9 < 32 + # instance: 0 <= P9 < 2 and 0 <= M9 < 2 and 0 <= C9 < 2 # - shape: # name: Fc10 @@ -87,7 +87,7 @@ problem: # - {name: Fmap10, dimensions: [ Fmap10_C, Fmap10_P ], projection: '[ C10, P10 ]'} # - {name: Filter10, dimensions: [ Filter10_C, Filter10_M ], projection: '[ C10, M10 ]'} # - {name: Fmap11, dimensions: [ Fmap11_C, Fmap11_P ], projection: '[ M10, P10 ]', read_write: True} - # instance: 0 <= P10 < 32 and 0 <= M10 < 32 and 0 <= C10 < 32 + # instance: 0 <= P10 < 2 and 0 <= M10 < 2 and 0 <= C10 < 2 # - shape: # name: Fc11 @@ -96,7 +96,7 @@ problem: # - {name: Fmap11, dimensions: [ Fmap11_C, Fmap11_P ], projection: '[ C11, P11 ]'} # - {name: Filter11, dimensions: [ Filter11_C, Filter11_M ], projection: '[ C11, M11 ]'} # - {name: Fmap12, dimensions: [ Fmap12_C, Fmap12_P ], projection: '[ M11, P11 ]', read_write: True} - # instance: 0 <= P11 < 32 and 0 <= M11 < 32 and 0 <= C11 < 32 + # instance: 0 <= P11 < 2 and 0 <= M11 < 2 and 0 <= C11 < 2 # - shape: # name: Fc12 @@ -105,7 +105,7 @@ problem: # - {name: Fmap12, dimensions: [ Fmap12_C, Fmap12_P ], projection: '[ C12, P12 ]'} # - {name: Filter12, dimensions: [ Filter12_C, Filter12_M ], projection: '[ C12, M12 ]'} # - {name: Fmap13, dimensions: [ Fmap13_C, Fmap13_P ], projection: '[ M12, P12 ]', read_write: True} - # instance: 0 <= P12 < 32 and 0 <= M12 < 32 and 0 <= C12 < 32 + # instance: 0 <= P12 < 2 and 0 <= M12 < 2 and 0 <= C12 < 2 # - shape: # name: Fc13 @@ -114,7 +114,7 @@ problem: # - {name: Fmap13, dimensions: [ Fmap13_C, Fmap13_P ], projection: '[ C13, P13 ]'} # - {name: Filter13, dimensions: [ Filter13_C, Filter13_M ], projection: '[ C13, M13 ]'} # - {name: Fmap14, dimensions: [ Fmap14_C, Fmap14_P ], projection: '[ M13, P13 ]', read_write: True} - # instance: 0 <= P13 < 32 and 0 <= M13 < 32 and 0 <= C13 < 32 + # instance: 0 <= P13 < 2 and 0 <= M13 < 2 and 0 <= C13 < 2 # - shape: # name: Fc14 @@ -123,7 +123,7 @@ problem: # - {name: Fmap14, dimensions: [ 
Fmap14_C, Fmap14_P ], projection: '[ C14, P14 ]'} # - {name: Filter14, dimensions: [ Filter14_C, Filter14_M ], projection: '[ C14, M14 ]'} # - {name: Fmap15, dimensions: [ Fmap15_C, Fmap15_P ], projection: '[ M14, P14 ]', read_write: True} - # instance: 0 <= P14 < 32 and 0 <= M14 < 32 and 0 <= C14 < 32 + # instance: 0 <= P14 < 2 and 0 <= M14 < 2 and 0 <= C14 < 2 # - shape: # name: Fc15 @@ -132,7 +132,7 @@ problem: # - {name: Fmap15, dimensions: [ Fmap15_C, Fmap15_P ], projection: '[ C15, P15 ]'} # - {name: Filter15, dimensions: [ Filter15_C, Filter15_M ], projection: '[ C15, M15 ]'} # - {name: Fmap16, dimensions: [ Fmap16_C, Fmap16_P ], projection: '[ M15, P15 ]', read_write: True} - # instance: 0 <= P15 < 32 and 0 <= M15 < 32 and 0 <= C15 < 32 + # instance: 0 <= P15 < 2 and 0 <= M15 < 2 and 0 <= C15 < 2 # - shape: # name: Fc16 @@ -141,4 +141,4 @@ problem: # - {name: Fmap16, dimensions: [ Fmap16_C, Fmap16_P ], projection: '[ C16, P16 ]'} # - {name: Filter16, dimensions: [ Filter16_C, Filter16_M ], projection: '[ C16, M16 ]'} # - {name: Fmap17, dimensions: [ Fmap17_C, Fmap17_P ], projection: '[ M16, P16 ]', read_write: True} - # instance: 0 <= P16 < 32 and 0 <= M16 < 32 and 0 <= C16 < 32 \ No newline at end of file + # instance: 0 <= P16 < 2 and 0 <= M16 < 2 and 0 <= C16 < 2 \ No newline at end of file
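
---
Reviewer notes (illustrative sketches, not part of the patch):

1. The renaming search in layerdeduplication.py only pairs tensors that
   share the same property set (input/output/intermediate), which prunes
   the search over all tensor bijections. A minimal self-contained sketch
   of that enumeration, using toy tensor names in place of workload
   tensor ids:

    from itertools import permutations, product

    def tensor_renamings(property_to_tensors):
        # No bijection can exist unless each property class has equally
        # many tensors on both sides.
        for tensors_of_1, tensors_of_2 in property_to_tensors.values():
            if len(tensors_of_1) != len(tensors_of_2):
                return
        all_tensors_of_1 = [
            t
            for tensors_of_1, _ in property_to_tensors.values()
            for t in tensors_of_1
        ]
        # Permute each property class of einsum2 independently; the
        # cross product yields every property-preserving bijection.
        per_class_permutations = [
            permutations(tensors_of_2)
            for _, tensors_of_2 in property_to_tensors.values()
        ]
        for permutation in product(*per_class_permutations):
            flat = tuple(t for tupl in permutation for t in tupl)
            yield dict(zip(all_tensors_of_1, flat))

    # Two inputs and one output per Einsum: only same-role tensors may
    # be matched, so 2! * 1! = 2 candidate renamings are generated
    # instead of 3! = 6.
    property_to_tensors = {
        frozenset({'input'}): ({'A', 'B'}, {'X', 'Y'}),
        frozenset({'output'}): ({'C'}, {'Z'}),
    }
    for renaming in tensor_renamings(property_to_tensors):
        print(renaming)  # {'A': 'X', 'B': 'Y', 'C': 'Z'} and the swap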
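
2. How the hinted search in shape_subspace2.py is meant to be used,
   assuming the patch is applied in the pytimeloop tree. The evaluator
   below is a stand-in for a real buffer-capacity check (the name
   check_capacity and the limit of 12 are made up for illustration);
   binary_search and EvaluationResult are the definitions added by this
   patch:

    from pytimeloop.fastfusion.mapper.shape_subspace2 import (
        EvaluationResult, binary_search)

    def check_capacity(tile_shape: int) -> EvaluationResult:
        # Stand-in evaluation: tile shapes above 12 overflow the buffer.
        if tile_shape > 12:
            return EvaluationResult.TOO_LARGE
        return EvaluationResult.VALID

    # MAXIMIZE hint: the largest valid tile shape in [1, 32] is found in
    # O(log n) evaluations rather than by trying all 32 shapes.
    assert binary_search(1, 32, check_capacity, search_max=True) == 12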
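
3. What generate_data in mapper2.py does to each cached mapping: a
   reference Einsum's tilings are reused for an equivalent Einsum by
   renaming rank and tensor ids. The dataclasses below are hypothetical
   stand-ins modeling only the fields of Loop/TensorStorage from
   pytimeloop.fastfusion.sim that _convert_tiling touches; the renaming
   values mirror the new test case:

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class Loop:
        rank_id: str
        bound: int
        is_spatial: bool

    @dataclass(frozen=True)
    class TensorStorage:
        tensor_id: str
        backer_id: int
        above_loop_index: int
        tile_size: int

    rank_renaming = {'9': '16', '10': '17', '11': '18'}
    tensor_renaming = {'2': '4', '3': '5', '4': '6'}

    loops = (Loop('9', 4, False), Loop('10', 8, True))
    tensors = frozenset({TensorStorage('2', 0, 1, 64)})

    # Mirror of _convert_tiling: only ids change; loop bounds, storage
    # levels, and tile sizes carry over unchanged.
    renamed_loops = tuple(
        Loop(rank_renaming[l.rank_id], l.bound, l.is_spatial)
        for l in loops)
    renamed_tensors = frozenset(
        TensorStorage(tensor_renaming[t.tensor_id], t.backer_id,
                      t.above_loop_index, t.tile_size)
        for t in tensors)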