From fef4d05c5fa80b76add8246d40647c5bc6dd5cad Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Mon, 25 Mar 2024 04:09:00 +0000
Subject: [PATCH 01/15] Allow pre->pre and late->late cross loop dependencies

---
 slothy/core/core.py     | 57 +++++++++++++++++++++++++++++++----------
 slothy/core/dataflow.py |  2 ++
 2 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/slothy/core/core.py b/slothy/core/core.py
index 0d082493..13ebb649 100644
--- a/slothy/core/core.py
+++ b/slothy/core/core.py
@@ -1775,6 +1775,13 @@ def _add_path_constraint( self, consumer, producer, cb):
             cb()
             return
 
+        if self._is_low(consumer) and self._is_high(producer):
+            ct = cb()
+            ct.OnlyEnforceIf([consumer.pre_var, producer.pre_var])
+            ct = cb()
+            ct.OnlyEnforceIf([consumer.post_var, producer.post_var])
+            return
+
         if self._is_input(producer) and self._is_low(consumer):
             return
         if self._is_output(consumer) and self._is_high(producer):
@@ -1802,6 +1809,9 @@ def _add_path_constraint_from( self, consumer, producer, cb_lst):
         bvars = [ self._NewBoolVar("") for _ in cb_lst ]
         self._AddExactlyOne(bvars)
 
+        if self._is_low(consumer) and self._is_high(producer):
+            raise Exception("Not yet implemented")
+
         if not self.config.sw_pipelining.enabled or producer.is_virtual or consumer.is_virtual:
             for (cb, bvar) in zip(cb_lst, bvars, strict=True):
                 cb().OnlyEnforceIf(bvar)
@@ -2141,15 +2151,29 @@ def _is_output(self, t):
         assert isinstance(t, ComputationNode)
         return t.is_virtual_output
 
-    def _iter_dependencies(self, with_virt=True):
-        def f(t):
-            if with_virt:
-                return True
+    def _iter_dependencies(self, with_virt=True, with_duals=True):
+        def check_dep(t):
             (consumer, producer, _, _) = t
-            return consumer in self._get_nodes() and \
-                   producer.src in self._get_nodes()
+            if with_virt:
+                yield t
+            elif consumer in self._get_nodes() and \
+                 producer.src in self._get_nodes():
+                yield t
+
+        def is_cross_iteration_dependency(producer, consumer):
+            if not self.config.sw_pipelining.enabled is True:
+                return False
+            return self._is_low(producer.src) and self._is_high(consumer)
 
-        yield from filter(f, self._model.tree.iter_dependencies())
+        for t in self._model.tree.iter_dependencies():
+            yield from check_dep(t)
+
+            if with_duals is False:
+                continue
+
+            (consumer, producer, a, b) = t
+            if is_cross_iteration_dependency(producer, consumer):
+                yield from check_dep((consumer.sibling, producer.sibling(), a, b))
 
     def _iter_dependencies_with_lifetime(self):
 
@@ -2158,7 +2182,7 @@ def _get_lifetime_start(src):
                 return src.src.out_lifetime_start[src.idx]
             if isinstance(src, InstructionInOut):
                 return src.src.inout_lifetime_start[src.idx]
-            raise SlothyException("Unknown register source")
+            raise SlothyException(f"Unknown register source {src}")
 
         def _get_lifetime_end(src):
             if isinstance(src, InstructionOutput):
@@ -2168,9 +2192,9 @@ def _get_lifetime_end(src):
             raise SlothyException("Unknown register source")
 
         for (consumer, producer, ty, idx) in self._iter_dependencies():
-            start_var = _get_lifetime_start(producer)
-            end_var = _get_lifetime_end(producer)
-            yield (consumer, producer, ty, idx, start_var, end_var, producer.alloc())
+            producer_start_var = _get_lifetime_start(producer)
+            producer_end_var = _get_lifetime_end(producer)
+            yield (consumer, producer, ty, idx, producer_start_var, producer_end_var, producer.alloc())
 
     def _iter_cross_iteration_dependencies(self):
         def is_cross_iteration_dependency(dep):
@@ -2387,15 +2411,20 @@ def _add_constraints_loop_optimization(self):
                 self._AddImplication( producer.src.post_var, consumer.post_var )
                 self._AddImplication( consumer.pre_var, producer.src.pre_var )
                 self._AddImplication( producer.src.pre_var, consumer.post_var.Not() )
-            elif self._is_low(producer.src):
+            elif self._is_low(producer.src) and self._is_high(consumer):
+                self._AddImplication( producer.src.pre_var, consumer.pre_var )
+                self._AddImplication( consumer.post_var, producer.src.post_var )
+            #     self._AddImplication(producer.src.pre_var
+            #     pass
+
                 # An instruction with forward dependency to the next iteration
                 # cannot be an early instruction, and an instruction depending
                 # on an instruction from a previous iteration cannot be late.
 
                 # pylint:disable=singleton-comparison
-                self._Add(producer.src.pre_var == False)
+               #  self._Add(producer.src.pre_var == False)
                 # pylint:disable=singleton-comparison
-                self._Add(consumer.post_var == False)
+               # self._Add(consumer.post_var == False)
 
     # ================================================================
     #                  CONSTRAINTS (Single issuing)                  #
diff --git a/slothy/core/dataflow.py b/slothy/core/dataflow.py
index e2e24b4b..96b7a72d 100644
--- a/slothy/core/dataflow.py
+++ b/slothy/core/dataflow.py
@@ -74,6 +74,8 @@ def alloc(self):
         return self.src.alloc_out_var[self.idx]
     def reduce(self):
         return self
+    def sibling(self):
+        return InstructionOutput(self.src.sibling, self.idx)
 
 class InstructionInOut(RegisterSource):
     """Represents an input/output of a node in the data flow graph"""

From d836e274edd0842f6975e607d069dc0d5516a26e Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Thu, 28 Mar 2024 05:37:05 +0000
Subject: [PATCH 02/15] WIP: Add prefetch instruction to AArch64 model

---
 slothy/targets/aarch64/aarch64_neon.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index 77b67a5f..839bf709 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -929,6 +929,17 @@ def make(cls, src):
         obj.addr = obj.args_in[0]
         return obj
 
+class prefetch(Ldr_Q): # pylint: disable=missing-docstring,invalid-name
+    pattern = "prfm pld1lkeep, [<Xc>, <imm>]"
+    inputs = ["Xc"]
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = obj.immediate
+        obj.addr = obj.args_in[0]
+        return obj
+
 class q_ldr_with_inc_hint(Ldr_Q): # pylint: disable=missing-docstring,invalid-name
     pattern = "ldrh <Qa>, <Xc>, <imm>, <Th>"
     inputs = ["Xc", "Th"]

From af4956676706c62dfe3032897fe5c8ab9b7fad60 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 04:10:25 +0000
Subject: [PATCH 03/15] Add a few ld1 variants to AArch64 model

---
 slothy/core/dataflow.py                |  2 ++
 slothy/targets/aarch64/aarch64_neon.py | 40 ++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/slothy/core/dataflow.py b/slothy/core/dataflow.py
index 96b7a72d..0bb152b4 100644
--- a/slothy/core/dataflow.py
+++ b/slothy/core/dataflow.py
@@ -89,6 +89,8 @@ def alloc(self):
         return self.src.alloc_in_out_var[self.idx]
     def reduce(self):
         return self.src.src_in_out[self.idx].reduce()
+    def sibling(self):
+        return InstructionInOut(self.src.sibling, self.idx)
 
 class VirtualInstruction:
     """A 'virtual' instruction node for inputs and outputs."""
diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index 839bf709..0b50aa23 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -929,6 +929,18 @@ def make(cls, src):
         obj.addr = obj.args_in[0]
         return obj
 
+class q_ld1(Ldr_Q): # pylint: disable=missing-docstring,invalid-name
+    pattern = "ld1 {<Va>.<dt>}, [<Xc>]"
+    inputs = ["Xc"]
+    outputs = ["Va"]
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = None
+        obj.addr = obj.args_in[0]
+        return obj
+
 class prefetch(Ldr_Q): # pylint: disable=missing-docstring,invalid-name
     pattern = "prfm pld1lkeep, [<Xc>, <imm>]"
     inputs = ["Xc"]
@@ -1072,6 +1084,22 @@ def write(self):
         self.immediate = simplify(self.pre_index)
         return super().write()
 
+class q_ld1_with_inc(Ldr_Q): # pylint: disable=missing-docstring,invalid-name
+    pattern = "ld1 {<Va>.<dt>}, [<Xc>, <imm>]"
+    inputs = ["Xc"]
+    outputs = ["Va"]
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = obj.immediate
+        obj.addr = obj.args_in[0]
+        return obj
+
+    def write(self):
+        self.immediate = simplify(self.pre_index)
+        return super().write()
+
 class q_ldp_with_inc(Ldp_Q): # pylint: disable=missing-docstring,invalid-name
     pattern = "ldp <Qa>, <Qb>, [<Xc>, <imm>]"
     inputs = ["Xc"]
@@ -1112,6 +1140,18 @@ def make(cls, src):
         obj.addr = obj.args_in[0]
         return obj
 
+class q_ld1_with_postinc(Ldr_Q): # pylint: disable=missing-docstring,invalid-name
+    pattern = "ld1 {<Va>.<dt>}, [<Xc>], <imm>"
+    inputs = ["Xc"]
+    outputs = ["Va"]
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = obj.immediate
+        obj.pre_index = None
+        obj.addr = obj.args_in[0]
+        return obj
+
 class q_ldp_with_postinc(Ldp_Q): # pylint: disable=missing-docstring,invalid-name
     pattern = "ldp <Qa>, <Qb>, [<Xc>], <imm>"
     inputs = ["Xc"]

From 35a284e192c7c0497b601513302758955d5352f6 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 04:10:59 +0000
Subject: [PATCH 04/15] Add uxtl, smlal, smlal2 to AArch64 model

---
 slothy/targets/aarch64/aarch64_neon.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index 0b50aa23..21020708 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -2153,6 +2153,11 @@ class vuzp2(AArch64Instruction): # pylint: disable=missing-docstring,invalid-nam
     inputs = ["Va", "Vb"]
     outputs = ["Vd"]
 
+class vuxtl(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+    pattern = "uxtl <Vd>.<dt0>, <Va>.<dt1>"
+    inputs = ["Va"]
+    outputs = ["Vd"]
+
 class vqrdmulh(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
     pattern = "sqrdmulh <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
     inputs = ["Va", "Vb"]
@@ -2336,11 +2341,24 @@ class vmull(AArch64Instruction): # pylint: disable=missing-docstring,invalid-nam
     inputs = ["Va", "Vb"]
     outputs = ["Vd"]
 
-class vmlal(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class Vmlal(AArch64Instruction):
+    pass
+
+class vmlal(Vmlal): # pylint: disable=missing-docstring,invalid-name
     pattern = "umlal <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
     inputs = ["Va", "Vb"]
     in_outs=["Vd"]
 
+class vsmlal(Vmlal): # pylint: disable=missing-docstring,invalid-name
+    pattern = "smlal <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
+    inputs = ["Va", "Vb"]
+    in_outs=["Vd"]
+
+class vsmlal2(Vmlal): # pylint: disable=missing-docstring,invalid-name
+    pattern = "smlal2 <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
+    inputs = ["Va", "Vb"]
+    in_outs=["Vd"]
+
 class vsrshr(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
     pattern = "srshr <Vd>.<dt0>, <Va>.<dt1>, <imm>"
     inputs = ["Va"]

From 6addfcbbed5d8437b32246a65547850c0e769cbf Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 04:12:16 +0000
Subject: [PATCH 05/15] Add uxtl, smlal, smlal2 to experimental N1 model

---
 .../targets/aarch64/neoverse_n1_experimental.py   | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/slothy/targets/aarch64/neoverse_n1_experimental.py b/slothy/targets/aarch64/neoverse_n1_experimental.py
index a4e79ea2..12f11f79 100644
--- a/slothy/targets/aarch64/neoverse_n1_experimental.py
+++ b/slothy/targets/aarch64/neoverse_n1_experimental.py
@@ -94,10 +94,11 @@ def get_min_max_objective(slothy):
     (vmovi)                   : ExecutionUnit.V(),
     (vand, vadd)              : ExecutionUnit.V(),
     (vxtn)                    : ExecutionUnit.V(),
-    (vshl, vshl_d, vshli, vshrn) : ExecutionUnit.V1(),
+    (vuxtl, vshl, vshl_d,
+     vshli, vshrn)            : ExecutionUnit.V1(),
     vusra                     : ExecutionUnit.V1(),
     AESInstruction            : ExecutionUnit.V0(),
-    (vmul, vmlal, vmull)      : ExecutionUnit.V0(),
+    (vmul, Vmlal, vmull)      : ExecutionUnit.V0(),
     AArch64NeonLogical        : ExecutionUnit.V(),
     (AArch64BasicArithmetic,
      AArch64ConditionalSelect,
@@ -128,10 +129,11 @@ def get_min_max_objective(slothy):
     AArch64NeonLogical         : 1,
     (vmovi)                    : 1,
     (vxtn)                     : 1,
-    (vshl, vshl_d, vshli, vshrn) : 1,
+    (vuxtl, vshl, vshl_d,
+     vshli, vshrn)             : 1,
     (vmul)                     : 2,
     vusra                      : 1,
-    (vmlal, vmull)             : 1,
+    (Vmlal, vmull)             : 1,
     (AArch64BasicArithmetic,
      AArch64ConditionalSelect,
      AArch64ConditionalCompare,
@@ -164,8 +166,9 @@ def get_min_max_objective(slothy):
     (vmovi)                   : 2,
     (vmul)                    : 5,
     vusra                     : 4, # TODO: Add fwd path
-    (vmlal, vmull)            : 4, # TODO: Add fwd path
-    (vshl, vshl_d, vshli, vshrn) : 2,
+    (Vmlal, vmull)            : 4, # TODO: Add fwd path
+    (vuxtl, vshl, vshl_d,
+     vshli, vshrn)            : 2,
     (AArch64BasicArithmetic,
      AArch64ConditionalSelect,
      AArch64ConditionalCompare,

From 6c354cbaa1ed49538c91a46dab366b21e9be4c5d Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 04:55:56 +0000
Subject: [PATCH 06/15] Fix inconsistency in visualization style of
 optimization

With `config.visualize_expected_performance` set, a graph with a legend
is shown, but the legend is missing if `visualize_expected_performance == False`.

This commit largely copy-pastes the visualization code from
`visualize_expected_performance` to the other visualization
functions, so that a legend is printed in any case.

Ultimately, this should be cleaned up a bit and a helper function
introduced for the pretty printing of a performance/reordering graph.
---
 slothy/core/config.py |   2 +
 slothy/core/core.py   | 185 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 160 insertions(+), 27 deletions(-)

diff --git a/slothy/core/config.py b/slothy/core/config.py
index 8982f525..3e7258ac 100644
--- a/slothy/core/config.py
+++ b/slothy/core/config.py
@@ -1048,6 +1048,8 @@ def __init__(self, Arch, Target):
         self.late_char = 'l'
         self.core_char = '*'
 
+        self.mirror_char = "~"
+
         self.typing_hints = {}
 
         self.solver_random_seed = 42
diff --git a/slothy/core/core.py b/slothy/core/core.py
index 13ebb649..37989e65 100644
--- a/slothy/core/core.py
+++ b/slothy/core/core.py
@@ -58,13 +58,91 @@ def orig_code(self, val):
         assert self._orig_code is None
         self._orig_code = val
 
-    def _gen_orig_code_visualized(self):
-        if self.codesize == 0:
-            return
+    def _gen_orig_code_visualized_perf(self):
+        early_char = self.config.early_char
+        late_char  = self.config.late_char
+        core_char  = self.config.core_char
+        d = self.config.placeholder_char
+
+        mirror_char = self.config.mirror_char
+
+        fixlen = max(map(len, self.orig_code)) + 8
+
+        def arr_width(arr):
+            mi = min(arr)
+            ma = max(0, max(arr)) # pylint:disable=nested-min-max
+            return mi, ma-mi
+
+        def center_str_fixlen(txt, fixlen, char='-'):
+            txt = ' ' + txt + ' '
+            l = min(len(txt), fixlen)
+            lpad = (fixlen - l) // 2
+            rpad = (fixlen - l) - lpad
+            return char * lpad + txt + char * rpad
+
+        block_size = 25
+
+        min_pos, width = arr_width(self.cycle_position_with_bubbles.values())
+        width = max(width, block_size+5)
+
+        cycles = width
+        cycle_blocks = math.ceil(cycles/block_size)
+        cycle_remainder = cycles % block_size
+
+        legend0 = center_str_fixlen('cycle (expected)', cycles, '-') + '>'
+        legend1 = ''.join([str(i*block_size).ljust(block_size) for i in range(cycle_blocks)])
+        legend2 = (('|' + '-' * (block_size - 1))) * (cycle_blocks - 1)
+        legend2 = legend2 + '|' + '-' * (cycle_remainder if cycle_remainder != 0 else block_size)
+        yield SourceLine("")
+        yield SourceLine("").set_comment(legend0).set_length(fixlen + 1)
+        yield SourceLine("").set_comment(legend1).set_length(fixlen + 1)
+        yield SourceLine("").set_comment(legend2).set_length(fixlen + 1)
+        for i in range(self.codesize):
+            pos = self.cycle_position_with_bubbles[i] - min_pos
+            c = core_char
+            if self.config.sw_pipelining.enabled and self.is_pre(i):
+                c = early_char
+            elif self.config.sw_pipelining.enabled and self.is_post(i):
+                c = late_char
+
+            # String of the form "...{e,*,l}...", with e/l/* in position pos
+            t_comment = [d for _ in range(width+1)]
+            if self.config.sw_pipelining.enabled is True:
+                if min_pos < 0:
+                    t_comment[-min_pos] = "'"
+                c_pos = max(-min_pos,0) + self.cycles
+                while c_pos < width:
+                    t_comment[c_pos] = "'"
+                    c_pos += self.cycles
+
+            c_pos = pos
+            t_comment[c_pos] = c
+
+            if self.config.sw_pipelining.enabled is True:
+                # Also display sibling of instruction in other iterations
+                c = mirror_char
+                c_pos = pos - self.cycles
+                while c_pos >= 0:
+                    t_comment[c_pos] = c
+                    c_pos -= self.cycles
+                c_pos = pos + self.cycles
+                while c_pos < width:
+                    t_comment[c_pos] = c
+                    c_pos += self.cycles
+
+            t_comment = ''.join(t_comment)
+
+            yield SourceLine("")                                      \
+                .set_comment(f"{self.orig_code[i].text:{fixlen-3}s}") \
+                .add_comment(t_comment)
+
+        yield SourceLine("")
 
+    def _gen_orig_code_visualized_perm(self):
         early_char = self.config.early_char
         late_char  = self.config.late_char
         core_char  = self.config.core_char
+        mirror_char = self.config.mirror_char
         d = self.config.placeholder_char
 
         fixlen = max(map(len, self.orig_code)) + 8
@@ -74,10 +152,30 @@ def arr_width(arr):
             ma = max(0, max(arr)) # pylint:disable=nested-min-max
             return mi, ma-mi
 
+        def center_str_fixlen(txt, fixlen, char='-'):
+            txt = ' ' + txt + ' '
+            l = min(len(txt), fixlen)
+            lpad = (fixlen - l) // 2
+            rpad = (fixlen - l) - lpad
+            return char * lpad + txt + char * rpad
+
         min_pos, width = arr_width(self.reordering.values())
 
+        block_size = 25
+        width = max(width, block_size+5)
+
+        cycles = width
+        cycle_blocks = math.ceil(cycles/block_size)
+        cycle_remainder = cycles % block_size
+
+        legend0 = center_str_fixlen('new position', cycles, '-') + '>'
+        legend1 = ''.join([str(i*block_size).ljust(block_size) for i in range(cycle_blocks)])
+        legend2 = (('|' + '-' * (block_size - 1))) * (cycle_blocks - 1)
+        legend2 = legend2 + '|' + '-' * (cycle_remainder if cycle_remainder != 0 else block_size)
         yield SourceLine("")
-        yield SourceLine("").set_comment("original source code")
+        yield SourceLine("").set_comment(legend0).set_length(fixlen + 1)
+        yield SourceLine("").set_comment(legend1).set_length(fixlen + 1)
+        yield SourceLine("").set_comment(legend2).set_length(fixlen + 1)
         for i in range(self.codesize):
             pos = self.reordering[i] - min_pos
             c = core_char
@@ -88,18 +186,29 @@ def arr_width(arr):
 
             # String of the form "...{e,*,l}...", with e/l/* in position pos
             t_comment = [d for _ in range(width+1)]
-            if min_pos < 0:
-                t_comment[-min_pos] = '|'
-            if width > max(-min_pos,0) + self.codesize:
-                t_comment[max(-min_pos,0) + self.codesize] = '|'
-            c_pos = pos
-            while c_pos >= 0:
-                t_comment[c_pos] = c
-                c_pos -= self.codesize
+            if self.config.sw_pipelining.enabled is True:
+                if min_pos < 0:
+                    t_comment[-min_pos] = "'"
+                c_pos = max(-min_pos,0) + self.codesize
+                while c_pos < width:
+                    t_comment[c_pos] = "'"
+                    c_pos += self.codesize
+
             c_pos = pos
-            while c_pos < width:
-                t_comment[c_pos] = c
-                c_pos += self.codesize
+            t_comment[c_pos] = c
+
+            if self.config.sw_pipelining.enabled is True:
+                # Also display sibling of instruction in other iterations
+                c = mirror_char
+                c_pos = pos - self.codesize
+                while c_pos >= 0:
+                    t_comment[c_pos] = c
+                    c_pos -= self.codesize
+                c_pos = pos + self.codesize
+                while c_pos < width:
+                    t_comment[c_pos] = c
+                    c_pos += self.codesize
+
             t_comment = ''.join(t_comment)
 
             yield SourceLine("")                                      \
@@ -108,6 +217,15 @@ def arr_width(arr):
 
         yield SourceLine("")
 
+    def _gen_orig_code_visualized(self):
+        if self.codesize == 0:
+            return
+
+        if self.config.visualize_expected_performance:
+            yield from self._gen_orig_code_visualized_perf()
+        else:
+            yield from self._gen_orig_code_visualized_perm()
+
     @property
     def cycles(self):
         """The number of cycles that SLOTHY thinks the code will take.
@@ -421,15 +539,35 @@ def _get_code(self, visualize_reordering):
         core_char  = self.config.core_char
         d = self.config.placeholder_char
 
+        def center_str_fixlen(txt, fixlen, char='-'):
+            txt = ' ' + txt + ' '
+            l = min(len(txt), fixlen)
+            lpad = (fixlen - l) // 2
+            rpad = (fixlen - l) - lpad
+            return char * lpad + txt + char * rpad
+
         def gen_visualized_code_perm():
-            yield SourceLine("").set_comment("----- original position ---->")
+            cs = self.codesize_with_bubbles
+            if cs == 0:
+                return
+            block_size = 25
+            width = max(self.codesize, block_size + 5)
+            blocks = math.ceil(width/block_size)
+
+            legend0 = center_str_fixlen('original position', width - 1, '-') + '>'
+            legend1 = ''.join([str(i*block_size).ljust(block_size) for i in range(blocks)])
+            legend2 = (('|' + '-' * (block_size - 1))) * (blocks - 1)
+            legend2 = legend2 + '|' + '-' * max(width % block_size - 1, 0)
+            yield SourceLine("").set_comment(legend0).set_length(fixlen)
+            yield SourceLine("").set_comment(legend1).set_length(fixlen)
+            yield SourceLine("").set_comment(legend2).set_length(fixlen)
             for i in range(self.codesize_with_bubbles):
                 p = ri.get(i, None)
                 if p is None:
                     gap_str = "gap"
                     yield SourceLine("")    \
                         .set_comment(f"{gap_str:{fixlen-4}s}") \
-                        .add_comment(d * self.codesize)
+                        .add_comment(d * width)
                     continue
                 s = code[self.periodic_reordering[p]]
                 c = core_char
@@ -437,25 +575,18 @@ def gen_visualized_code_perm():
                     c = early_char
                 elif self.is_post(p):
                     c = late_char
-                vis = d * p + c + d * (self.codesize - p - 1)
+                vis = d * p + c + d * (width - p - 1)
                 yield s.copy().set_length(fixlen).set_comment(vis)
 
-        def center_str_fixlen(txt, fixlen, char='-'):
-            txt = ' ' + txt + ' '
-            l = min(len(txt), fixlen)
-            lpad = (fixlen - l) // 2
-            rpad = (fixlen - l) - lpad
-            return char * lpad + txt + char * rpad
-
         def gen_visualized_code_perf():
             cs = self.codesize_with_bubbles
             if cs == 0:
                 return
-            cycles = self.cycles
             block_size = 25
+            cycles = max(self.cycles, block_size + 5)
             cycle_blocks = math.ceil(cycles/block_size)
 
-            legend0 = center_str_fixlen('expected cycle count', cycles - 1, '-') + '>'
+            legend0 = center_str_fixlen('cycle (expected)', cycles - 1, '-') + '>'
             legend1 = ''.join([str(i*block_size).ljust(block_size) for i in range(cycle_blocks)])
             legend2 = (('|' + '-' * (block_size - 1))) * (cycle_blocks - 1)
             legend2 = legend2 + '|' + '-' * max(cycles % block_size - 1, 0)

From 4bbe21f7454d5680dd139fbdb745907449974196 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 06:19:44 +0000
Subject: [PATCH 07/15] Improve compatibility with C preprocessor by adding -I
 config option

---
 slothy/core/config.py | 12 ++++++++++++
 slothy/core/slothy.py | 31 ++++++++++++++++++-------------
 slothy/helper.py      | 33 +++++++++++++++++++++++----------
 3 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/slothy/core/config.py b/slothy/core/config.py
index 3e7258ac..d9e6e2e4 100644
--- a/slothy/core/config.py
+++ b/slothy/core/config.py
@@ -334,6 +334,14 @@ def compiler_binary(self):
         or `with_llvm_mca_after` are set."""
         return self._compiler_binary
 
+    @property
+    def compiler_include_paths(self):
+        """Include path to add to compiler invocations
+
+        This is only relevant if `with_preprocessor` or `with_llvm_mca_before`
+        or `with_llvm_mca_after` are set."""
+        return self._compiler_include_paths
+
     @property
     def llvm_mca_binary(self):
         """The llvm-mca binary to be used for estimated performance annotations
@@ -1021,6 +1029,7 @@ def __init__(self, Arch, Target):
         self._split_heuristic_preprocess_naive_interleaving_by_latency = False
 
         self._compiler_binary = "gcc"
+        self._compiler_include_paths = None
         self._llvm_mca_binary = "llvm-mca"
 
         self.keep_tags = True
@@ -1127,6 +1136,9 @@ def with_llvm_mca_before(self, val):
     @compiler_binary.setter
     def compiler_binary(self, val):
         self._compiler_binary = val
+    @compiler_include_paths.setter
+    def compiler_include_paths(self, val):
+        self._compiler_include_paths = val
     @llvm_mca_binary.setter
     def llvm_mca_binary(self, val):
         self._llvm_mca_binary = val
diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py
index b3808ebe..89f925ad 100644
--- a/slothy/core/slothy.py
+++ b/slothy/core/slothy.py
@@ -161,7 +161,8 @@ def unfold(self, start=None, end=None, macros=True, aliases=False):
 
         if c.with_preprocessor:
             self.logger.info("Apply C preprocessor...")
-            body = CPreprocessor.unfold(pre, body, c.compiler_binary)
+            body = CPreprocessor.unfold(pre, body, post, c.compiler_binary,
+                                        include=c.compiler_include_paths)
             self.logger.debug("Code after preprocessor:")
             Slothy._dump("preprocessed", body, self.logger, err=False)
 
@@ -183,9 +184,10 @@ def unfold(self, start=None, end=None, macros=True, aliases=False):
 
         self.source = pre + body + post
 
-    def _make_llvm_mca_stats(self, pre, code, txt, indentation):
+    def _make_llvm_mca_stats(self, pre, code, post, txt, indentation):
         try:
-            code = CPreprocessor.unfold(pre, code, self.config.compiler_binary)
+            code = CPreprocessor.unfold(pre, code, post, self.config.compiler_binary,
+                                        include=self.config.compiler_include_paths)
             stats = LLVM_Mca.run(pre, code, self.config.llvm_mca_binary,
                              self.config.arch.llvm_mca_arch,
                              self.config.target.llvm_mca_target, self.logger)
@@ -234,7 +236,8 @@ def optimize(self, start=None, end=None, loop_synthesis_cb=None, logname=None):
 
         if c.with_preprocessor:
             self.logger.info("Apply C preprocessor...")
-            body = CPreprocessor.unfold(pre, body, c.compiler_binary)
+            body = CPreprocessor.unfold(pre, body, post, c.compiler_binary,
+                                        include=c.compiler_include_paths)
             self.logger.debug("Code after preprocessor:")
             Slothy._dump("preprocessed", body, self.logger, err=False)
 
@@ -245,7 +248,7 @@ def optimize(self, start=None, end=None, loop_synthesis_cb=None, logname=None):
         self.logger.info("Instructions in body: %d", len(list(filter(None, body))))
 
         if self.config.with_llvm_mca_before is True:
-            orig_stats = self._make_llvm_mca_stats(pre, body, "ORIGINAL", indentation)
+            orig_stats = self._make_llvm_mca_stats(pre, body, post, "ORIGINAL", indentation)
 
         early, core, late, num_exceptional = Heuristics.periodic(body, logger, c)
 
@@ -253,7 +256,7 @@ def optimize(self, start=None, end=None, loop_synthesis_cb=None, logname=None):
             core = core + orig_stats
 
         if self.config.with_llvm_mca_after is True:
-            new_stats_kernel = self._make_llvm_mca_stats(pre, core, "OPTIMIZED",
+            new_stats_kernel = self._make_llvm_mca_stats(pre, core, post, "OPTIMIZED",
                                                          indentation)
 
             core = core + new_stats_kernel
@@ -314,12 +317,13 @@ def get_input_from_output(self, start, end, outputs=None):
         dfgc = DFGConfig(c)
         return list(DFG(body, logger.getChild("dfg_find_deps"), dfgc).inputs)
 
-    def _fusion_core(self, pre, body, logger):
+    def _fusion_core(self, pre, body, post, logger):
         c = self.config.copy()
 
         if c.with_preprocessor:
             self.logger.info("Apply C preprocessor...")
-            body = CPreprocessor.unfold(pre, body, c.compiler_binary)
+            body = CPreprocessor.unfold(pre, body, post, c.compiler_binary,
+                                        include=c.compiler_include_paths)
             self.logger.debug("Code after preprocessor:")
             Slothy._dump("preprocessed", body, self.logger, err=False)
         body = SourceLine.split_semicolons(body)
@@ -400,7 +404,8 @@ def optimize_loop(self, loop_lbl, postamble_label=None):
 
         if c.with_preprocessor:
             self.logger.info("Apply C preprocessor...")
-            body = CPreprocessor.unfold(early, body, c.compiler_binary)
+            body = CPreprocessor.unfold(early, body, late, c.compiler_binary,
+                                        include=c.compiler_include_paths)
             self.logger.debug("Code after preprocessor:")
             Slothy._dump("preprocessed", body, self.logger, err=False)
 
@@ -412,7 +417,7 @@ def optimize_loop(self, loop_lbl, postamble_label=None):
             loop_lbl, len(body))
 
         if self.config.with_llvm_mca_before is True:
-            orig_stats = self._make_llvm_mca_stats(early, body, "ORIGINAL", indentation)
+            orig_stats = self._make_llvm_mca_stats(early, body, late, "ORIGINAL", indentation)
 
         preamble_code, kernel_code, postamble_code, num_exceptional = \
             Heuristics.periodic(body, logger, c)
@@ -422,19 +427,19 @@ def optimize_loop(self, loop_lbl, postamble_label=None):
 
         if self.config.with_llvm_mca_after is True:
             print(SourceLine.write_multiline(kernel_code))
-            new_stats_kernel = self._make_llvm_mca_stats(early, kernel_code, "OPTIMIZED",
+            new_stats_kernel = self._make_llvm_mca_stats(early, kernel_code, late, "OPTIMIZED",
                                                          indentation)
             kernel_code = kernel_code + new_stats_kernel
 
             if self.config.sw_pipelining.optimize_preamble is True \
                and len(preamble_code) > 0:
-                new_stats_preamble = self._make_llvm_mca_stats(early, preamble_code, "PREAMBLE",
+                new_stats_preamble = self._make_llvm_mca_stats(early, preamble_code, late, "PREAMBLE",
                                                                indentation)
                 preamble_code = preamble_code + new_stats_preamble
 
             if self.config.sw_pipelining.optimize_postamble is True \
                and len(postamble_code) > 0:
-                new_stats_postamble = self._make_llvm_mca_stats(early, postamble_code, "POSTAMBLE",
+                new_stats_postamble = self._make_llvm_mca_stats(early, postamble_code, late, "POSTAMBLE",
                                                                 indentation)
                 postamble_code = postamble_code + new_stats_postamble
 
diff --git a/slothy/helper.py b/slothy/helper.py
index e2d17744..83d0ca74 100644
--- a/slothy/helper.py
+++ b/slothy/helper.py
@@ -912,30 +912,43 @@ def extract_from_file(filename):
 class CPreprocessor():
     """Helper class for the application of the C preprocessor"""
 
-    magic_string = "SLOTHY_PREPROCESSED_REGION"
+    magic_string_start = "SLOTHY_PREPROCESSED_REGION_BEGIN"
+    magic_string_end = "SLOTHY_PREPROCESSED_REGION_END"
 
     @staticmethod
-    def unfold(header, body, gcc):
+    def unfold(header, body, post, gcc, include=None):
         """Runs the concatenation of header and body through the preprocessor"""
 
         assert SourceLine.is_source(body)
         assert SourceLine.is_source(header)
+        assert SourceLine.is_source(post)
 
         body_txt = SourceLine.write_multiline(body)
         header_txt = SourceLine.write_multiline(header)
+        footer_txt = SourceLine.write_multiline(post)
+
+        code_txt = '\n'.join([header_txt,
+                              CPreprocessor.magic_string_start,
+                              body_txt,
+                              CPreprocessor.magic_string_end,
+                              footer_txt])
+
+        if include is None:
+            include = []
+            # Ignore #include's
+            code_txt = code_txt.replace("#include","//#include")
+        else:
+            include = ["-I", include]
 
-        code_txt = '\n'.join([header_txt, CPreprocessor.magic_string, body_txt])
-
-        # Ignore #include's until -I can be configured
-        code_txt = code_txt.replace("#include","//#include")
+        cmd = [gcc] + include + ["-E", "-CC", "-x", "assembler-with-cpp","-"]
 
         # Pass -CC to keep comments
-        r = subprocess.run([gcc, "-E", "-CC", "-x", "assembler-with-cpp","-"],
-                           input=code_txt, text=True, capture_output=True, check=True)
+        r = subprocess.run(cmd, input=code_txt, text=True, capture_output=True, check=True)
 
         unfolded_code = r.stdout.split('\n')
-        magic_idx = unfolded_code.index(CPreprocessor.magic_string)
-        unfolded_code = unfolded_code[magic_idx+1:]
+        magic_idx_start = unfolded_code.index(CPreprocessor.magic_string_start)
+        magic_idx_end = unfolded_code.index(CPreprocessor.magic_string_end)
+        unfolded_code = unfolded_code[magic_idx_start+1:magic_idx_end]
 
         return [SourceLine(r) for r in unfolded_code]
 

From c25fb0c16d6a2d8f7bc5192514b7d4ce35ef9dd3 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 06:20:34 +0000
Subject: [PATCH 08/15] Parse slothy-cli arguments as strings by default

The previous `isidentifier()` check is too strict, e.g. for paths.
---
 slothy-cli | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/slothy-cli b/slothy-cli
index 388e09c5..7b7539ef 100755
--- a/slothy-cli
+++ b/slothy-cli
@@ -209,10 +209,8 @@ def _main():
             logger.debug("Parsing %s is a dictionary -- parse recursively", val)
             return { parse_config_value_as(k, None) : parse_config_value_as(v, None)
                      for k,v in kvs }
-        if val.isidentifier():
-            logger.debug("Parsing %s as string", val)
-            return val
-        raise CmdLineException(f"Could not parse configuration value '{val}'")
+        logger.debug("Parsing %s as string", val)
+        return val
 
     # A plain '-c' without arguments should list all available configuration options
     if [] in args.config:

From 4fabba80bf6275e8a20a9e1bda89a6af72910d8c Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 06:21:16 +0000
Subject: [PATCH 09/15] Add LLVM MCA target name for Neoverse N1 model

---
 slothy/targets/aarch64/neoverse_n1_experimental.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/slothy/targets/aarch64/neoverse_n1_experimental.py b/slothy/targets/aarch64/neoverse_n1_experimental.py
index 12f11f79..cbef25c9 100644
--- a/slothy/targets/aarch64/neoverse_n1_experimental.py
+++ b/slothy/targets/aarch64/neoverse_n1_experimental.py
@@ -34,6 +34,7 @@
 from slothy.targets.aarch64.aarch64_neon import *
 
 issue_rate = 4
+llvm_mca_target="neoverse-n1"
 
 class ExecutionUnit(Enum):
     """Enumeration of execution units in approximative Neoverse-N1 SLOTHY model"""

From f2ebb78d186eb717ef698b07f63854039f991c7b Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 06:36:16 +0000
Subject: [PATCH 10/15] Improve compatibility with LLVM MCA

By specifying `-c llvm_mca_full`, all available statistics will now
be printed. By default, only a subset is shown.

By specifying `-c llvm_mca_issue_width_overwrite`, the issue width
of the SLOTHY model is used instead of LLVM MCA's built-in one.
---
 slothy/core/config.py | 20 ++++++++++++++++++++
 slothy/core/slothy.py | 10 ++++++++--
 slothy/helper.py      | 13 +++++++++----
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/slothy/core/config.py b/slothy/core/config.py
index d9e6e2e4..935977de 100644
--- a/slothy/core/config.py
+++ b/slothy/core/config.py
@@ -306,6 +306,18 @@ def with_llvm_mca(self):
         """
         return self._with_llvm_mca_before and self._with_llvm_mca_after
 
+    @property
+    def llvm_mca_full(self):
+        """Indicates whether all available statistics from LLVM MCA should be printed.
+        """
+        return self._llvm_mca_full
+
+    @property
+    def llvm_mca_issue_width_overwrite(self):
+        """Overwrite LLVM MCA's in-built issue width with the one SLOTHY uses
+        """
+        return self._llvm_mca_issue_width_overwrite
+
     @property
     def with_llvm_mca_before(self):
         """Indicates whether LLVM MCA should be run prior to optimization
@@ -1039,6 +1051,8 @@ def __init__(self, Arch, Target):
         self._do_address_fixup = True
 
         self._with_preprocessor = False
+        self._llvm_mca_full = False
+        self._llvm_mca_issue_width_overwrite = False
         self._with_llvm_mca_before = False
         self._with_llvm_mca_after = False
         self._max_solutions = 64
@@ -1123,6 +1137,12 @@ def max_solutions(self, val):
     @with_preprocessor.setter
     def with_preprocessor(self, val):
         self._with_preprocessor = val
+    @llvm_mca_issue_width_overwrite.setter
+    def llvm_mca_issue_width_overwrite(self, val):
+        self._llvm_mca_issue_width_overwrite = val
+    @llvm_mca_full.setter
+    def llvm_mca_full(self, val):
+        self._llvm_mca_full = val
     @with_llvm_mca.setter
     def with_llvm_mca(self, val):
         self._with_llvm_mca_before = val
diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py
index 89f925ad..e2d0f70b 100644
--- a/slothy/core/slothy.py
+++ b/slothy/core/slothy.py
@@ -188,9 +188,15 @@ def _make_llvm_mca_stats(self, pre, code, post, txt, indentation):
         try:
             code = CPreprocessor.unfold(pre, code, post, self.config.compiler_binary,
                                         include=self.config.compiler_include_paths)
+            if self.config.llvm_mca_issue_width_overwrite is True:
+                issue_width = self.config.target.issue_rate
+            else:
+                issue_width = None
             stats = LLVM_Mca.run(pre, code, self.config.llvm_mca_binary,
-                             self.config.arch.llvm_mca_arch,
-                             self.config.target.llvm_mca_target, self.logger)
+                                 self.config.arch.llvm_mca_arch,
+                                 self.config.target.llvm_mca_target, self.logger,
+                                 full=self.config.llvm_mca_full,
+                                 issue_width=issue_width)
             stats = ["",f"LLVM MCA STATISTICS ({txt}) BEGIN",""] + stats + \
                 ["", f"ORIGINAL LLVM MCA STATISTICS ({txt}) END",""]
             stats = [SourceLine("").add_comment(r) for r in stats]
diff --git a/slothy/helper.py b/slothy/helper.py
index 83d0ca74..d3a6a95c 100644
--- a/slothy/helper.py
+++ b/slothy/helper.py
@@ -959,7 +959,7 @@ class LLVM_Mca():
     """Helper class for the application of the LLVM MCA tool"""
 
     @staticmethod
-    def run(header, body, mca_binary, arch, cpu, log):
+    def run(header, body, mca_binary, arch, cpu, log, full=False, issue_width=None):
         """Runs LLVM-MCA tool on body and returns result as array of strings"""
 
         LLVM_MCA_BEGIN = SourceLine("").add_comment("LLVM-MCA-BEGIN")
@@ -968,9 +968,14 @@ def run(header, body, mca_binary, arch, cpu, log):
         data = SourceLine.write_multiline(header + [LLVM_MCA_BEGIN] + body + [LLVM_MCA_END])
 
         try:
-            r = subprocess.run([mca_binary, f"--mcpu={cpu}", f"--march={arch}",
-                            "--instruction-info=0", "--dispatch-stats=0", "--timeline=1", "--timeline-max-cycles=0",
-                            "--timeline-max-iterations=3"],
+            if full is False:
+                args = ["--instruction-info=0", "--dispatch-stats=0", "--timeline=1", "--timeline-max-cycles=0",
+                            "--timeline-max-iterations=3"]
+            else:
+                args = ["--all-stats", "--all-views", "--bottleneck-analysis", "--timeline=1", "--timeline-max-cycles=0", "--timeline-max-iterations=3"]
+            if issue_width is not None:
+                args += ["--dispatch", str(issue_width)]
+            r = subprocess.run([mca_binary, f"--mcpu={cpu}", f"--march={arch}"] + args,
                             input=data, text=True, capture_output=True, check=True)
         except subprocess.CalledProcessError as exc:
             raise LLVM_Mca_Error from exc

From 526a2f57d37a13b6883c7b588e860514b99e118e Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 29 Mar 2024 09:00:47 +0000
Subject: [PATCH 11/15] Remove debug print statement

---
 slothy/core/slothy.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py
index e2d0f70b..5c17dec0 100644
--- a/slothy/core/slothy.py
+++ b/slothy/core/slothy.py
@@ -432,7 +432,6 @@ def optimize_loop(self, loop_lbl, postamble_label=None):
             kernel_code = kernel_code + orig_stats
 
         if self.config.with_llvm_mca_after is True:
-            print(SourceLine.write_multiline(kernel_code))
             new_stats_kernel = self._make_llvm_mca_stats(early, kernel_code, late, "OPTIMIZED",
                                                          indentation)
             kernel_code = kernel_code + new_stats_kernel

From e13ecae6de2abce09a6a06051e7dee67dd206234 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Sun, 31 Mar 2024 05:37:46 +0100
Subject: [PATCH 12/15] Add W-form variant of stack-store with immediate

---
 slothy/targets/aarch64/aarch64_neon.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index 21020708..9fb54ecb 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -2550,6 +2550,21 @@ def write(self):
         self.immediate = simplify(self.pre_index)
         return super().write()
 
+class w_str_sp_imm(Str_X): # pylint: disable=missing-docstring,invalid-name
+    pattern = "str <Wa>, [sp, <imm>]"
+    inputs = ["Wa"]
+    @classmethod
+    def make(cls, src):
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None
+        obj.pre_index = obj.immediate
+        obj.addr = "sp"
+        return obj
+
+    def write(self):
+        self.immediate = simplify(self.pre_index)
+        return super().write()
+
 class x_str_postinc(Str_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "str <Xa>, [<Xc>], <imm>"
     inputs = ["Xa", "Xc"]

From 3035d83b6ba9eb34ee09c4b32b797b464b044c55 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Sun, 31 Mar 2024 06:17:58 +0100
Subject: [PATCH 13/15] Print solver's bound for # of cycles in generated
 assembly

The solver provides us with a lower/upper bound for the objective
it is minimizing/maximizing.

In the case where we minimize the number of cycles, print the
objective bound in the emitted assembly, so it's clear if the
code is optimal according to the model, or can perhaps be improved
by running SLOTHY with a larger timeout.
---
 slothy/core/core.py | 76 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 60 insertions(+), 16 deletions(-)

diff --git a/slothy/core/core.py b/slothy/core/core.py
index 37989e65..2d79c07b 100644
--- a/slothy/core/core.py
+++ b/slothy/core/core.py
@@ -234,13 +234,32 @@ def cycles(self):
         count per iteration."""
         return (self.codesize_with_bubbles // self.config.target.issue_rate)
 
+    @property
+    def cycles_bound(self):
+        """A lower bound for the number of cycles obtained during optimization.
+
+        This may be lower than the estimated cycle count of the result itself if optimization
+        terminated prematurely, e.g. because of a timeout."""
+        return self._cycles_bound
+
+    @property
+    def ipc_bound(self):
+        """An upper bound on the instruction/cycle (IPC) count obtained during optimization.
+
+        This may be higher than the IPC value of the result itself if optimization
+        terminated prematurely, e.g. because of a timeout."""
+        cc = self.cycles_bound
+        if cc is None or cc == 0:
+            return None
+        return (self.codesize / cc)
+
     @property
     def ipc(self):
         """The instruction/cycle (IPC) count that SLOTHY thinks the code will have."""
         cc = self.cycles
         if cc == 0:
             return 0
-        return (self.codesize / self.cycles)
+        return (self.codesize / cc)
 
     @property
     def orig_code_visualized(self):
@@ -623,6 +642,13 @@ def gen_visualized_code():
         res.append(SourceLine("")                                     \
                    .set_comment(f"Expected IPC:    {self.ipc:.2f}")   \
                    .set_length(fixlen))
+        if self.cycles_bound is not None:
+            res.append(SourceLine("")                                           \
+                       .set_comment(f"Cycle bound:     {self.cycles_bound}")    \
+                       .set_length(fixlen))
+            res.append(SourceLine("")                                           \
+                       .set_comment(f"IPC bound:       {self.ipc_bound:.2f}")   \
+                       .set_length(fixlen))
 
         res += list(gen_visualized_code())
         res += self.orig_code_visualized
@@ -844,6 +870,11 @@ def stalls(self, v):
         assert self._stalls is None
         self._stalls = v
 
+    @cycles_bound.setter
+    def cycles_bound(self, v):
+        assert self._cycles_bound is None
+        self._cycles_bound = v
+
     def _build_stalls_idxs(self):
         self._stalls_idxs = { j for (i,j) in self.reordering.items() if
                               self.reordering_with_bubbles[i] + 1 not in
@@ -1155,6 +1186,7 @@ def __init__(self, config):
         self._reordering_with_bubbles = None
         self._valid = False
         self._success = None
+        self._cycles_bound = None
         self._stalls = None
         self._stalls_idxs = None
         self._input = None
@@ -1573,14 +1605,14 @@ def on_solution_callback(self):
                 bound = self.BestObjectiveBound()
                 time = self.WallTime()
                 if self.__printer is not None:
-                    add_cur = self.__printer(cur)
-                    add_bound = self.__printer(bound)
+                    cur_str = self.__printer(cur)
+                    bound_str = self.__printer(bound)
                 else:
-                    add_cur = ""
-                    add_bound = ""
+                    cur_str = str(cur)
+                    bound_str = str(bound)
                 self.__logger.info(
                     f"[{time:.4f}s]: Found {self.__solution_count} solutions so far... " +
-                    f"objective {cur}{add_cur}, bound {bound}{add_bound} ({self.__objective_desc})")
+                    f"objective ({self.__objective_desc}): currently {cur_str}, bound {bound_str}")
                 if self.__is_good_enough and self.__is_good_enough(cur, bound):
                     self.StopSearch()
             if self.__solution_count >= self.__max_solutions:
@@ -1721,6 +1753,11 @@ def _extract_positions(self, get_value):
 
         if self.config.variable_size:
             self._result.stalls = get_value(self._model.stalls)
+            stalls_bound = self._model.cp_solver.BestObjectiveBound()
+            stats = self._stalls_to_stats(stalls_bound)
+            if stats is not None:
+                cycles_bound, _ = stats
+                self._result.cycles_bound = cycles_bound
 
         nodes = self._model.tree.nodes
         if self.config.sw_pipelining.enabled:
@@ -2959,6 +2996,21 @@ def restrict_slots_for_instructions_by_property(self, filter_func, slots):
     #                         OBJECTIVES                            #
     # ==============================================================#
 
+    def _stalls_to_stats(self, stalls):
+        psize = self._model.min_slots + \
+            self._model.pfactor * stalls
+        cc = psize // self.config.target.issue_rate
+        cs = self._model.tree.num_nodes
+        if cc == 0:
+            return None
+        cycles = psize // self._model.pfactor
+        ipc = cs / cc
+        return (cycles, ipc)
+
+    def _print_stalls(self, stalls):
+        (cycles, ipc) = self._stalls_to_stats(stalls)
+        return f" (Cycles ~ {cycles}, IPC ~ {ipc:.2f})"
+
     def _add_objective(self, force_objective=False):
         minlist = []
         maxlist = []
@@ -2969,17 +3021,9 @@ def _add_objective(self, force_objective=False):
 
         # If the number of stalls is variable, its minimization is our objective
         if force_objective is False and self.config.variable_size:
-            name = "minimize number of stalls"
+            name = "minimize cycles"
             if self.config.constraints.functional_only is False:
-                def get_cpi(stalls):
-                    psize = self._model.min_slots + \
-                        self._model.pfactor * stalls
-                    cc = psize // self.config.target.issue_rate
-                    cs = self._model.tree.num_nodes
-                    if cc == 0:
-                        return ""
-                    return f" (Cycles ~ {psize // self._model.pfactor}, IPC ~ {cs / cc:.2f})"
-                printer = get_cpi
+                printer = self._print_stalls
             minlist = [self._model.stalls]
         elif self.config.has_objective and not self.config.ignore_objective:
             if self.config.sw_pipelining.enabled is True and \

From 687bdbf648b52e13e01e411a9f2db1bd3a2db2be Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Sun, 31 Mar 2024 07:02:15 +0100
Subject: [PATCH 14/15] Report wall and user time in generated assembly

---
 slothy/core/core.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/slothy/core/core.py b/slothy/core/core.py
index 2d79c07b..ccf9ea67 100644
--- a/slothy/core/core.py
+++ b/slothy/core/core.py
@@ -253,6 +253,16 @@ def ipc_bound(self):
             return None
         return (self.codesize / cc)
 
+    @property
+    def optimization_wall_time(self):
+        """Returns the amount of wall clock time in seconds the optimization has taken"""
+        return self._optimization_wall_time
+
+    @property
+    def optimization_user_time(self):
+        """Returns the amount of CPU time in seconds the optimization has taken"""
+        return self._optimization_user_time
+
     @property
     def ipc(self):
         """The instruction/cycle (IPC) count that SLOTHY thinks the code will have."""
@@ -296,6 +306,16 @@ def codesize_with_bubbles(self, v):
         assert self._codesize_with_bubbles is None
         self._codesize_with_bubbles = v
 
+    @optimization_user_time.setter
+    def optimization_user_time(self, v):
+        assert self._optimization_user_time is None
+        self._optimization_user_time = v
+
+    @optimization_wall_time.setter
+    def optimization_wall_time(self, v):
+        assert self._optimization_wall_time is None
+        self._optimization_wall_time = v
+
     @property
     def pre_core_post_dict(self):
         """Dictionary indicating interleaving of iterations.
@@ -643,12 +663,28 @@ def gen_visualized_code():
                    .set_comment(f"Expected IPC:    {self.ipc:.2f}")   \
                    .set_length(fixlen))
         if self.cycles_bound is not None:
+            res.append(SourceLine("")                                           \
+                       .set_comment(f"")                                        \
+                       .set_length(fixlen))
             res.append(SourceLine("")                                           \
                        .set_comment(f"Cycle bound:     {self.cycles_bound}")    \
                        .set_length(fixlen))
             res.append(SourceLine("")                                           \
                        .set_comment(f"IPC bound:       {self.ipc_bound:.2f}")   \
                        .set_length(fixlen))
+        if self.optimization_wall_time is not None:
+            res.append(SourceLine("")                                           \
+                       .set_comment(f"")                                        \
+                       .set_length(fixlen))
+            res.append(SourceLine("")                                           \
+                       .set_comment(f"Wall time:     {self.optimization_wall_time:.2f}s")  \
+                       .set_length(fixlen))
+            res.append(SourceLine("")                                           \
+                       .set_comment(f"User time:     {self.optimization_user_time:.2f}s")  \
+                       .set_length(fixlen))
+        res.append(SourceLine("")                                           \
+                   .set_comment(f"")                                        \
+                   .set_length(fixlen))
 
         res += list(gen_visualized_code())
         res += self.orig_code_visualized
@@ -1194,6 +1230,8 @@ def __init__(self, config):
         self._pre_core_post_dict = None
         self._codesize_with_bubbles = None
         self._register_used = None
+        self._optimization_wall_time = None
+        self._optimization_user_time = None
 
         self.lock()
 
@@ -1759,6 +1797,9 @@ def _extract_positions(self, get_value):
                 cycles_bound, _ = stats
                 self._result.cycles_bound = cycles_bound
 
+        self._result.optimization_wall_time = self._model.cp_solver.WallTime()
+        self._result.optimization_user_time = self._model.cp_solver.UserTime()
+
         nodes = self._model.tree.nodes
         if self.config.sw_pipelining.enabled:
             nodes_low = self._model.tree.nodes_low

From 38d5db8b587ef5d0c6371140dc63fc4249904dd6 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Sun, 31 Mar 2024 06:21:31 +0100
Subject: [PATCH 15/15] N1 model: Leave comment regarding the modelling of
 Q-str

---
 slothy/targets/aarch64/neoverse_n1_experimental.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/slothy/targets/aarch64/neoverse_n1_experimental.py b/slothy/targets/aarch64/neoverse_n1_experimental.py
index cbef25c9..fa47c4d7 100644
--- a/slothy/targets/aarch64/neoverse_n1_experimental.py
+++ b/slothy/targets/aarch64/neoverse_n1_experimental.py
@@ -87,6 +87,15 @@ def get_min_max_objective(slothy):
     (Ldp_X, Ldr_X,
      Str_X, Stp_X,
      Ldr_Q, Str_Q)            : ExecutionUnit.LSU(),
+    # TODO: The following would be more accurate, but does not
+    #       necessarily lead to better results, while making the
+    #       optimization slower. Investigate...
+    #
+    # Ldr_Q)            : ExecutionUnit.LSU(),
+    # Str_Q : [[ExecutionUnit.VEC0, ExecutionUnit.LSU0],
+    #          [ExecutionUnit.VEC0, ExecutionUnit.LSU1],
+    #          [ExecutionUnit.VEC1, ExecutionUnit.LSU0],
+    #          [ExecutionUnit.VEC1, ExecutionUnit.LSU1]],
     (vuzp1, vuzp2, vzip1,
      Vrev, uaddlp)           : ExecutionUnit.V(),
     (vmov)                    : ExecutionUnit.V(),