diff --git a/example.py b/example.py index fc133550..9043fa24 100644 --- a/example.py +++ b/example.py @@ -581,7 +581,9 @@ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): def core(self,slothy): slothy.config.variable_size=True slothy.config.constraints.stalls_first_attempt=32 - slothy.optimize() + slothy.config.selfcheck = False + slothy.config.allow_useless_instructions = True + slothy.fusion_region("start", "end", ssa=False) class AArch64Example0Equ(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): @@ -639,6 +641,21 @@ def core(self,slothy): slothy.config.sw_pipelining.optimize_postamble = False slothy.optimize_loop("start") +class AArch64Split0(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): + name = "aarch64_split0" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target) + + def core(self,slothy): + slothy.config.allow_useless_instructions = True + slothy.fusion_region("start", "end", ssa=False) class Armv7mExample0(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7): name = "armv7m_simple0" @@ -1211,7 +1228,7 @@ def core(self, slothy): slothy.config.constraints.stalls_first_attempt = 110 slothy.optimize_loop("layer123_start") - + class ntt_dilithium_123(Example): @@ -1334,7 +1351,7 @@ def core(self, slothy): slothy.optimize_loop("layer5678_start") slothy.config = conf.copy() - + if self.timeout is not None: slothy.config.timeout = self.timeout // 12 @@ -1351,7 +1368,7 @@ def core(self, slothy): slothy.config.split_heuristic_stepsize = 0.1 slothy.config.constraints.stalls_first_attempt = 14 slothy.optimize_loop("layer1234_start") - + class ntt_dilithium_1234(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72): @@ -1495,6 +1512,8 @@ def main(): AArch64Example2(), AArch64Example2(target=Target_CortexA72), + AArch64Split0(), + # Armv7m examples Armv7mExample0(), diff --git a/examples/naive/aarch64/aarch64_split0.s b/examples/naive/aarch64/aarch64_split0.s new file mode 100644 index 00000000..a448f25c --- /dev/null +++ b/examples/naive/aarch64/aarch64_split0.s @@ -0,0 +1,15 @@ +start: +ldr q0, [x1, #0] +ldr q1, [x2, #0] +eor3 v5.16b, v1.16b, v2.16b, v3.16b // @slothy:some_tag // some comment +eor3 v3.16b, v1.16b, v2.16b, v3.16b // Cannot we split naively +ldr q8, [x0] +ldr q9, [x0, #1*16] +ldr q10, [x0, #2*16] +ldr q11, [x0, #3*16] +mul v24.8h, v9.8h, v0.h[0] +sqrdmulh v9.8h, v9.8h, v0.h[1] +mls v24.8h, v9.8h, v1.h[0] +sub v9.8h, v8.8h, v24.8h +add v8.8h, v8.8h, v24.8h +end: diff --git a/examples/opt/aarch64/aarch64_split0_opt_a55.s b/examples/opt/aarch64/aarch64_split0_opt_a55.s new file mode 100644 index 00000000..d0df6a96 --- /dev/null +++ b/examples/opt/aarch64/aarch64_split0_opt_a55.s @@ -0,0 +1,17 @@ +start: +ldr q0, [x1, #0] +ldr q1, [x2, #0] +eor v5.16B, v1.16B, v2.16B// some comment // @slothy:some_tag +eor v5.16B, v5.16B, v3.16B// some comment // @slothy:some_tag +eor3 v3.16B, v1.16B, v2.16B, v3.16B// Cannot we split naively +ldr q8, [x0] +ldr q9, [x0, #16] +ldr q10, [x0, #32] +ldr q11, [x0, #48] +mul v24.8H, v9.8H, v0.H[0] +sqrdmulh v9.8H, v9.8H, v0.H[1] +mls v24.8H, v9.8H, v1.H[0] +sub v9.8H, v8.8H, v24.8H +add v8.8H, v8.8H, v24.8H +end: + diff --git a/slothy/core/dataflow.py b/slothy/core/dataflow.py index e5f27005..2e333dd5 100644 --- a/slothy/core/dataflow.py +++ b/slothy/core/dataflow.py @@ -525,7 +525,18 @@ def apply_cbs(self, cb, logger, one_a_time=False): break z = filter(lambda x: x.delete is False, self.nodes) - z = map(lambda x: ([x.inst], x.inst.source_line), z) + + def pair_with_source(i): + return ([i], i.source_line) + def map_node(t): + s = t.inst + if not isinstance(t.inst, list): + s = [s] + return map(pair_with_source, s) + def flatten(llst): + return [x for y in llst for x in y] + + z = flatten(map(map_node, z)) self.src = list(z) diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py index b0b776ee..2fbfdf8a 100644 --- a/slothy/core/slothy.py +++ b/slothy/core/slothy.py @@ -324,7 +324,7 @@ def get_input_from_output(self, start, end, outputs=None): dfgc = DFGConfig(c) return list(DFG(body, logger.getChild("dfg_find_deps"), dfgc).inputs) - def _fusion_core(self, pre, body, post, logger): + def _fusion_core(self, pre, body, post, logger, ssa=True): c = self.config.copy() if c.with_preprocessor: @@ -342,9 +342,10 @@ def _fusion_core(self, pre, body, post, logger): body = AsmAllocation.unfold_all_aliases(c.register_aliases, body) dfgc = DFGConfig(c) - dfg = DFG(body, logger.getChild("ssa"), dfgc, parsing_cb=False) - dfg.ssa() - body = [ ComputationNode.to_source_line(t) for t in dfg.nodes ] + if ssa is True: + dfg = DFG(body, logger.getChild("ssa"), dfgc, parsing_cb=False) + dfg.ssa() + body = [ ComputationNode.to_source_line(t) for t in dfg.nodes ] dfg = DFG(body, logger.getChild("fusion"), dfgc, parsing_cb=False) dfg.apply_fusion_cbs() @@ -352,18 +353,18 @@ def _fusion_core(self, pre, body, post, logger): return body - def fusion_region(self, start, end): + def fusion_region(self, start, end, **kwargs): """Run fusion callbacks on straightline code""" logger = self.logger.getChild(f"ssa_{start}_{end}") pre, body, post = AsmHelper.extract(self.source, start, end) body_ssa = [ SourceLine(f"{start}:") ] +\ - self._fusion_core(pre, body, logger) + \ + self._fusion_core(pre, body, post, logger, **kwargs) + \ [ SourceLine(f"{end}:") ] self.source = pre + body_ssa + post assert SourceLine.is_source(self.source) - def fusion_loop(self, loop_lbl): + def fusion_loop(self, loop_lbl, **kwargs): """Run fusion callbacks on loop body""" logger = self.logger.getChild(f"ssa_loop_{loop_lbl}") @@ -373,25 +374,12 @@ def fusion_loop(self, loop_lbl): indentation = AsmHelper.find_indentation(body) body_ssa = SourceLine.read_multiline(loop.start(loop_cnt)) + \ - SourceLine.apply_indentation(self._fusion_core(pre, body, logger), indentation) + \ + SourceLine.apply_indentation(self._fusion_core(pre, body, late, logger, **kwargs), indentation) + \ SourceLine.read_multiline(loop.end(other_data)) self.source = pre + body_ssa + post assert SourceLine.is_source(self.source) - c = self.config.copy() - self.config.keep_tags = True - self.config.constraints.functional_only = True - self.config.constraints.allow_reordering = False - self.config.sw_pipelining.enabled = False - self.config.split_heuristic = False - self.config.inputs_are_outputs = True - self.config.sw_pipelining.unknown_iteration_count = False - self.optimize_loop(loop_lbl) - self.config = c - - assert SourceLine.is_source(self.source) - def optimize_loop(self, loop_lbl, postamble_label=None): """Optimize the loop starting at a given label The postamble_label marks the end of the loop kernel. diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py index 68fdf3c0..8422a5fd 100644 --- a/slothy/targets/aarch64/aarch64_neon.py +++ b/slothy/targets/aarch64/aarch64_neon.py @@ -44,6 +44,7 @@ class which generates instruction parsers and writers from instruction templates from enum import Enum from functools import cache +from slothy.helper import SourceLine from sympy import simplify from slothy.targets.common import * @@ -3175,7 +3176,50 @@ def core(inst,t,log=None): return core -veor.global_fusion_cb = eor3_fusion_cb() +# TODO: Test only... +# veor.global_fusion_cb = eor3_fusion_cb() + +def eor3_splitting_cb(): + def core(inst,t,log=None): + + d = inst.args_out[0] + a = inst.args_in[0] + b = inst.args_in[1] + c = inst.args_in[2] + + # Check if we can use the output as a temporary + if d in [a,b,c]: + return False + + eor0 = AArch64Instruction.build(veor, { "Vd": d, "Va" : a, "Vb" : b, + "datatype0":"16b", + "datatype1":"16b", + "datatype2":"16b" }) + eor1 = AArch64Instruction.build(veor, { "Vd": d, "Va" : d, "Vb" : c, + "datatype0":"16b", + "datatype1":"16b", + "datatype2":"16b" }) + + eor0_src = SourceLine(eor0.write()).\ + add_tags(inst.source_line.tags).\ + add_comments(inst.source_line.comments) + eor1_src = SourceLine(eor1.write()).\ + add_tags(inst.source_line.tags).\ + add_comments(inst.source_line.comments) + + eor0.source_line = eor0_src + eor1.source_line = eor1_src + + if log is not None: + log(f"EOR3 splitting: {t.inst}; {eor0} + {eor1}") + + t.changed = True + t.inst = [eor0, eor1] + return True + + return core + +veor3.global_fusion_cb = eor3_splitting_cb() def iter_aarch64_instructions(): yield from all_subclass_leaves(Instruction)