Skip to content

Commit

Permalink
TEST: Demonstrate how to potentially do instruction-splitting
Browse files Browse the repository at this point in the history
  • Loading branch information
hanno-becker authored and dop-amin committed Dec 4, 2024
1 parent 8df6fb5 commit 9ab558d
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 27 deletions.
27 changes: 23 additions & 4 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,9 @@ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
def core(self,slothy):
    """Configure `slothy` for this example and run it on the `start`..`end` region.

    NOTE(review): this span comes from a diff hunk and shows both the
    `slothy.optimize()` call and the fusion-only settings below it; confirm
    which of the two is meant to remain in the file.
    """
    slothy.config.variable_size=True
    slothy.config.constraints.stalls_first_attempt=32
    slothy.optimize()
    # The two flags below are set before the fusion pass is invoked.
    # NOTE(review): their exact effect is defined in the config class, which
    # is not visible here -- verify they are still wanted outside of testing.
    slothy.config.selfcheck = False
    slothy.config.allow_useless_instructions = True
    # `ssa=False` is passed through `fusion_region` as a keyword argument.
    slothy.fusion_region("start", "end", ssa=False)

class AArch64Example0Equ(Example):
def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
Expand Down Expand Up @@ -639,6 +641,21 @@ def core(self,slothy):
slothy.config.sw_pipelining.optimize_postamble = False
slothy.optimize_loop("start")

class AArch64Split0(Example):
    """Example that runs the fusion callbacks on the `aarch64_split0` input."""

    def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
        """Derive the input file name and the example name, then delegate.

        Both names start from "aarch64_split0" and get a "_<var>" suffix when
        `var` is non-empty; only the example name additionally gets the
        target label looked up in `target_label_dict`.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "aarch64_split0" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(infile, name, rename=True, arch=arch, target=target)

    def core(self,slothy):
        """Run the fusion callbacks on the region between `start` and `end`."""
        config = slothy.config
        config.allow_useless_instructions = True
        # `ssa=False` is passed through `fusion_region` as a keyword argument.
        slothy.fusion_region("start", "end", ssa=False)
class Armv7mExample0(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
name = "armv7m_simple0"
Expand Down Expand Up @@ -1211,7 +1228,7 @@ def core(self, slothy):
slothy.config.constraints.stalls_first_attempt = 110
slothy.optimize_loop("layer123_start")




class ntt_dilithium_123(Example):
Expand Down Expand Up @@ -1334,7 +1351,7 @@ def core(self, slothy):
slothy.optimize_loop("layer5678_start")

slothy.config = conf.copy()

if self.timeout is not None:
slothy.config.timeout = self.timeout // 12

Expand All @@ -1351,7 +1368,7 @@ def core(self, slothy):
slothy.config.split_heuristic_stepsize = 0.1
slothy.config.constraints.stalls_first_attempt = 14
slothy.optimize_loop("layer1234_start")


class ntt_dilithium_1234(Example):
def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72):
Expand Down Expand Up @@ -1495,6 +1512,8 @@ def main():
AArch64Example2(),
AArch64Example2(target=Target_CortexA72),

AArch64Split0(),

# Armv7m examples
Armv7mExample0(),

Expand Down
15 changes: 15 additions & 0 deletions examples/naive/aarch64/aarch64_split0.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
start:
ldr q0, [x1, #0]
ldr q1, [x2, #0]
eor3 v5.16b, v1.16b, v2.16b, v3.16b // @slothy:some_tag // some comment
eor3 v3.16b, v1.16b, v2.16b, v3.16b // Cannot be split naively
ldr q8, [x0]
ldr q9, [x0, #1*16]
ldr q10, [x0, #2*16]
ldr q11, [x0, #3*16]
mul v24.8h, v9.8h, v0.h[0]
sqrdmulh v9.8h, v9.8h, v0.h[1]
mls v24.8h, v9.8h, v1.h[0]
sub v9.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
end:
17 changes: 17 additions & 0 deletions examples/opt/aarch64/aarch64_split0_opt_a55.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
start:
ldr q0, [x1, #0]
ldr q1, [x2, #0]
eor v5.16B, v1.16B, v2.16B// some comment // @slothy:some_tag
eor v5.16B, v5.16B, v3.16B// some comment // @slothy:some_tag
eor3 v3.16B, v1.16B, v2.16B, v3.16B// Cannot be split naively
ldr q8, [x0]
ldr q9, [x0, #16]
ldr q10, [x0, #32]
ldr q11, [x0, #48]
mul v24.8H, v9.8H, v0.H[0]
sqrdmulh v9.8H, v9.8H, v0.H[1]
mls v24.8H, v9.8H, v1.H[0]
sub v9.8H, v8.8H, v24.8H
add v8.8H, v8.8H, v24.8H
end:

13 changes: 12 additions & 1 deletion slothy/core/dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,18 @@ def apply_cbs(self, cb, logger, one_a_time=False):
break

z = filter(lambda x: x.delete is False, self.nodes)
z = map(lambda x: ([x.inst], x.inst.source_line), z)

def pair_with_source(i):
    """Return `i` wrapped in a one-element list, paired with its source line."""
    source = i.source_line
    return ([i], source)

def map_node(t):
    """Return a lazy iterator of (instructions, source line) pairs for node `t`.

    `t.inst` may be a single instruction or a list of instructions; a single
    instruction is treated as a one-element list.
    """
    insts = t.inst if isinstance(t.inst, list) else [t.inst]
    return map(pair_with_source, insts)

def flatten(llst):
    """Concatenate an iterable of iterables into one flat list."""
    flat = []
    for inner in llst:
        flat.extend(inner)
    return flat

z = flatten(map(map_node, z))

self.src = list(z)

Expand Down
30 changes: 9 additions & 21 deletions slothy/core/slothy.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def get_input_from_output(self, start, end, outputs=None):
dfgc = DFGConfig(c)
return list(DFG(body, logger.getChild("dfg_find_deps"), dfgc).inputs)

def _fusion_core(self, pre, body, post, logger):
def _fusion_core(self, pre, body, post, logger, ssa=True):
c = self.config.copy()

if c.with_preprocessor:
Expand All @@ -342,28 +342,29 @@ def _fusion_core(self, pre, body, post, logger):
body = AsmAllocation.unfold_all_aliases(c.register_aliases, body)
dfgc = DFGConfig(c)

dfg = DFG(body, logger.getChild("ssa"), dfgc, parsing_cb=False)
dfg.ssa()
body = [ ComputationNode.to_source_line(t) for t in dfg.nodes ]
if ssa is True:
dfg = DFG(body, logger.getChild("ssa"), dfgc, parsing_cb=False)
dfg.ssa()
body = [ ComputationNode.to_source_line(t) for t in dfg.nodes ]

dfg = DFG(body, logger.getChild("fusion"), dfgc, parsing_cb=False)
dfg.apply_fusion_cbs()
body = [ ComputationNode.to_source_line(t) for t in dfg.nodes ]

return body

def fusion_region(self, start, end, **kwargs):
    """Run fusion callbacks on straightline code

    The code between the labels `start` and `end` is extracted from
    `self.source`, passed through `_fusion_core`, and written back to
    `self.source` wrapped in newly created `start:` / `end:` label lines.

    Args:
        start: Label marking the beginning of the region.
        end: Label marking the end of the region.
        **kwargs: Forwarded unchanged to `_fusion_core` (for example
            `ssa=False`).
    """
    logger = self.logger.getChild(f"ssa_{start}_{end}")
    pre, body, post = AsmHelper.extract(self.source, start, end)

    # `_fusion_core` takes (pre, body, post, logger, ...): `post` must be
    # passed explicitly, otherwise `logger` would land in the `post` slot.
    body_ssa = (
        [SourceLine(f"{start}:")]
        + self._fusion_core(pre, body, post, logger, **kwargs)
        + [SourceLine(f"{end}:")]
    )

    self.source = pre + body_ssa + post
    assert SourceLine.is_source(self.source)

def fusion_loop(self, loop_lbl):
def fusion_loop(self, loop_lbl, **kwargs):
"""Run fusion callbacks on loop body"""
logger = self.logger.getChild(f"ssa_loop_{loop_lbl}")

Expand All @@ -373,25 +374,12 @@ def fusion_loop(self, loop_lbl):
indentation = AsmHelper.find_indentation(body)

body_ssa = SourceLine.read_multiline(loop.start(loop_cnt)) + \
SourceLine.apply_indentation(self._fusion_core(pre, body, logger), indentation) + \
SourceLine.apply_indentation(self._fusion_core(pre, body, late, logger, **kwargs), indentation) + \
SourceLine.read_multiline(loop.end(other_data))

self.source = pre + body_ssa + post
assert SourceLine.is_source(self.source)

c = self.config.copy()
self.config.keep_tags = True
self.config.constraints.functional_only = True
self.config.constraints.allow_reordering = False
self.config.sw_pipelining.enabled = False
self.config.split_heuristic = False
self.config.inputs_are_outputs = True
self.config.sw_pipelining.unknown_iteration_count = False
self.optimize_loop(loop_lbl)
self.config = c

assert SourceLine.is_source(self.source)

def optimize_loop(self, loop_lbl, postamble_label=None):
"""Optimize the loop starting at a given label
The postamble_label marks the end of the loop kernel.
Expand Down
46 changes: 45 additions & 1 deletion slothy/targets/aarch64/aarch64_neon.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class which generates instruction parsers and writers from instruction templates
from enum import Enum
from functools import cache

from slothy.helper import SourceLine
from sympy import simplify

from slothy.targets.common import *
Expand Down Expand Up @@ -3175,7 +3176,50 @@ def core(inst,t,log=None):

return core

veor.global_fusion_cb = eor3_fusion_cb()
# TODO: Test only...
# veor.global_fusion_cb = eor3_fusion_cb()

def eor3_splitting_cb():
    """Build the callback that splits an EOR3 instruction into two EORs.

    The returned callback rewrites `eor3 d, a, b, c` as `eor d, a, b`
    followed by `eor d, d, c`, using the destination register as the
    temporary. Both new instructions carry the tags and comments of the
    original source line.
    """
    def core(inst,t,log=None):
        dst = inst.args_out[0]
        src_a, src_b, src_c = (inst.args_in[idx] for idx in range(3))

        # The destination holds the intermediate result, so the split is
        # only possible when it does not coincide with any of the inputs.
        if dst in (src_a, src_b, src_c):
            return False

        # First EOR combines a and b into d; the second folds c into d.
        pieces = []
        for lhs, rhs in ((src_a, src_b), (dst, src_c)):
            pieces.append(
                AArch64Instruction.build(veor, { "Vd": dst, "Va" : lhs, "Vb" : rhs,
                                                 "datatype0":"16b",
                                                 "datatype1":"16b",
                                                 "datatype2":"16b" }))
        eor0, eor1 = pieces

        # Render each new instruction and attach the original line's
        # tags and comments to it.
        rendered = []
        for piece in pieces:
            line = SourceLine(piece.write())
            line = line.add_tags(inst.source_line.tags)
            line = line.add_comments(inst.source_line.comments)
            rendered.append(line)

        for piece, line in zip(pieces, rendered):
            piece.source_line = line

        if log is not None:
            log(f"EOR3 splitting: {t.inst}; {eor0} + {eor1}")

        t.changed = True
        t.inst = [eor0, eor1]
        return True

    return core

veor3.global_fusion_cb = eor3_splitting_cb()

def iter_aarch64_instructions():
yield from all_subclass_leaves(Instruction)
Expand Down

0 comments on commit 9ab558d

Please sign in to comment.