diff --git a/.github/workflows/test_basic.yaml b/.github/workflows/test_basic.yaml
index 8da89a0f..49231d93 100644
--- a/.github/workflows/test_basic.yaml
+++ b/.github/workflows/test_basic.yaml
@@ -19,6 +19,21 @@ jobs:
- name: Run examples
run: |
python3 example.py --dry-run
+ tutorial:
+ if: ${{ github.event.label.name == 'needs-ci' ||
+ github.event.pull_request.user.login == 'hanno-becker' ||
+ github.event.pull_request.user.login == 'dop-amin' ||
+ github.event.pull_request.user.login == 'mkannwischer'
+ }}
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install python dependencies
+ run: |
+ python -m pip install -r requirements.txt
+ - name: Run tutorial
+ run: |
+ (cd tutorial && ./tutorial_all.sh)
examples_basic:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
diff --git a/example.py b/example.py
index c6330771..3e1181dd 100644
--- a/example.py
+++ b/example.py
@@ -517,6 +517,62 @@ def core(self, slothy):
slothy.config.sw_pipelining.halving_heuristic_periodic = True
slothy.optimize_loop("layer345_loop")
+class AArch64Example0(Example):
+ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
+ name = "aarch64_simple0"
+ infile = name
+
+ if var != "":
+ name += f"_{var}"
+ infile += f"_{var}"
+ name += f"_{target_label_dict[target]}"
+
+ super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+ def core(self,slothy):
+ slothy.config.variable_size=True
+ slothy.config.constraints.stalls_first_attempt=32
+ slothy.optimize()
+
+class AArch64Example1(Example):
+ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
+ name = "aarch64_simple0_macros"
+ infile = name
+
+ if var != "":
+ name += f"_{var}"
+ infile += f"_{var}"
+ name += f"_{target_label_dict[target]}"
+
+ super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+ def core(self,slothy):
+ slothy.config.variable_size=True
+ slothy.config.constraints.stalls_first_attempt=32
+ slothy.optimize(start="start", end="end")
+
+
+class AArch64Example2(Example):
+ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
+ name = "aarch64_simple0_loop"
+ infile = name
+
+ if var != "":
+ name += f"_{var}"
+ infile += f"_{var}"
+ name += f"_{target_label_dict[target]}"
+
+ super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+ def core(self,slothy):
+ slothy.config.variable_size=True
+ slothy.config.constraints.stalls_first_attempt=32
+ slothy.config.sw_pipelining.enabled = True
+ slothy.config.sw_pipelining.optimize_preamble = False
+ slothy.config.sw_pipelining.optimize_postamble = False
+ slothy.optimize_loop("start")
+
+
class ntt_kyber_123_4567(Example):
def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None):
@@ -1197,6 +1253,13 @@ def main():
Example2(),
Example3(),
+ AArch64Example0(),
+ AArch64Example0(target=Target_CortexA72),
+ AArch64Example1(),
+ AArch64Example1(target=Target_CortexA72),
+ AArch64Example2(),
+ AArch64Example2(target=Target_CortexA72),
+
CRT(),
ntt_n256_l6_s32("bar"),
diff --git a/examples/naive/aarch64/aarch64_simple0.s b/examples/naive/aarch64/aarch64_simple0.s
new file mode 100644
index 00000000..17299dcc
--- /dev/null
+++ b/examples/naive/aarch64/aarch64_simple0.s
@@ -0,0 +1,24 @@
+ldr q0, [x1, #0]
+ldr q1, [x2, #0]
+
+ldr q8, [x0]
+ldr q9, [x0, #1*16]
+ldr q10, [x0, #2*16]
+ldr q11, [x0, #3*16]
+
+mul v24.8h, v9.8h, v0.h[0]
+sqrdmulh v9.8h, v9.8h, v0.h[1]
+mls v24.8h, v9.8h, v1.h[0]
+sub v9.8h, v8.8h, v24.8h
+add v8.8h, v8.8h, v24.8h
+
+mul v24.8h, v11.8h, v0.h[0]
+sqrdmulh v11.8h, v11.8h, v0.h[1]
+mls v24.8h, v11.8h, v1.h[0]
+sub v11.8h, v10.8h, v24.8h
+add v10.8h, v10.8h, v24.8h
+
+str q8, [x0], #4*16
+str q9, [x0, #-3*16]
+str q10, [x0, #-2*16]
+str q11, [x0, #-1*16]
diff --git a/examples/naive/aarch64/aarch64_simple0_loop.s b/examples/naive/aarch64/aarch64_simple0_loop.s
new file mode 100644
index 00000000..10512245
--- /dev/null
+++ b/examples/naive/aarch64/aarch64_simple0_loop.s
@@ -0,0 +1,55 @@
+qdata0 .req q8
+qdata1 .req q9
+qdata2 .req q10
+qdata3 .req q11
+
+qtwiddle .req q0
+qmodulus .req q1
+
+data0 .req v8
+data1 .req v9
+data2 .req v10
+data3 .req v11
+
+twiddle .req v0
+modulus .req v1
+
+tmp .req v12
+
+data_ptr .req x0
+twiddle_ptr .req x1
+modulus_ptr .req x2
+
+.macro barmul out, in, twiddle, modulus
+ mul \out.8h, \in.8h, \twiddle.h[0]
+ sqrdmulh \in.8h, \in.8h, \twiddle.h[1]
+ mls \out.8h, \in.8h, \modulus.h[0]
+.endm
+
+.macro butterfly data0, data1, tmp, twiddle, modulus
+ barmul \tmp, \data1, \twiddle, \modulus
+ sub \data1.8h, \data0.8h, \tmp.8h
+ add \data0.8h, \data0.8h, \tmp.8h
+.endm
+
+count .req x2
+ldr qtwiddle, [twiddle_ptr, #0]
+ldr qmodulus, [modulus_ptr, #0]
+mov count, #16
+start:
+
+ ldr qdata0, [data_ptr, #0*16]
+ ldr qdata1, [data_ptr, #1*16]
+ ldr qdata2, [data_ptr, #2*16]
+ ldr qdata3, [data_ptr, #3*16]
+
+ butterfly data0, data1, tmp, twiddle, modulus
+ butterfly data2, data3, tmp, twiddle, modulus
+
+ str qdata0, [data_ptr], #4*16
+ str qdata1, [data_ptr, #-3*16]
+ str qdata2, [data_ptr, #-2*16]
+ str qdata3, [data_ptr, #-1*16]
+
+ subs count, count, #1
+ cbnz count, start
diff --git a/examples/naive/aarch64/aarch64_simple0_macros.s b/examples/naive/aarch64/aarch64_simple0_macros.s
new file mode 100644
index 00000000..d41f0056
--- /dev/null
+++ b/examples/naive/aarch64/aarch64_simple0_macros.s
@@ -0,0 +1,55 @@
+qdata0 .req q8
+qdata1 .req q9
+qdata2 .req q10
+qdata3 .req q11
+
+qtwiddle .req q0
+qmodulus .req q1
+
+data0 .req v8
+data1 .req v9
+data2 .req v10
+data3 .req v11
+
+twiddle .req v0
+modulus .req v1
+
+tmp .req v12
+
+data_ptr .req x0
+twiddle_ptr .req x1
+modulus_ptr .req x2
+
+.macro barmul out, in, twiddle, modulus
+ mul \out.8h, \in.8h, \twiddle.h[0]
+ sqrdmulh \in.8h, \in.8h, \twiddle.h[1]
+ mls \out.8h, \in.8h, \modulus.h[0]
+.endm
+
+.macro butterfly data0, data1, tmp, twiddle, modulus
+ barmul \tmp, \data1, \twiddle, \modulus
+ sub \data1.8h, \data0.8h, \tmp.8h
+ add \data0.8h, \data0.8h, \tmp.8h
+.endm
+
+count .req x2
+
+start:
+
+ ldr qtwiddle, [twiddle_ptr, #0]
+ ldr qmodulus, [modulus_ptr, #0]
+
+ ldr qdata0, [data_ptr, #0*16]
+ ldr qdata1, [data_ptr, #1*16]
+ ldr qdata2, [data_ptr, #2*16]
+ ldr qdata3, [data_ptr, #3*16]
+
+ butterfly data0, data1, tmp, twiddle, modulus
+ butterfly data2, data3, tmp, twiddle, modulus
+
+ str qdata0, [data_ptr], #4*16
+ str qdata1, [data_ptr, #-3*16]
+ str qdata2, [data_ptr, #-2*16]
+ str qdata3, [data_ptr, #-1*16]
+
+end:
diff --git a/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s b/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s
new file mode 100644
index 00000000..f945ec37
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s
@@ -0,0 +1,125 @@
+qdata0 .req q8
+qdata1 .req q9
+qdata2 .req q10
+qdata3 .req q11
+
+qtwiddle .req q0
+qmodulus .req q1
+
+data0 .req v8
+data1 .req v9
+data2 .req v10
+data3 .req v11
+
+twiddle .req v0
+modulus .req v1
+
+tmp .req v12
+
+data_ptr .req x0
+twiddle_ptr .req x1
+modulus_ptr .req x2
+
+.macro barmul out, in, twiddle, modulus
+ mul \out.8h, \in.8h, \twiddle.h[0]
+ sqrdmulh \in.8h, \in.8h, \twiddle.h[1]
+ mls \out.8h, \in.8h, \modulus.h[0]
+.endm
+
+.macro butterfly data0, data1, tmp, twiddle, modulus
+ barmul \tmp, \data1, \twiddle, \modulus
+ sub \data1.8h, \data0.8h, \tmp.8h
+ add \data0.8h, \data0.8h, \tmp.8h
+.endm
+
+count .req x2
+ldr qtwiddle, [twiddle_ptr, #0]
+ldr qmodulus, [modulus_ptr, #0]
+mov count, #16
+ ldr q3, [x0, #16]
+ sqrdmulh v7.8H, v3.8H, v0.H[1]
+ sub count, count, #1
+start:
+ mul v3.8H, v3.8H, v0.H[0] // ....*.............
+ // gap // ..................
+ ldr q19, [x0, #48] // ...*..............
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ ldr q15, [x0, #0] // *.................
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ mls v3.8H, v7.8H, v1.H[0] // ......*...........
+ // gap // ..................
+ mul v13.8H, v19.8H, v0.H[0] // .........*........
+ // gap // ..................
+ sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........*.......
+ // gap // ..................
+ ldr q7, [x0, #32] // ..*...............
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ sub v17.8H, v15.8H, v3.8H // .......*..........
+ // gap // ..................
+ add v10.8H, v15.8H, v3.8H // ........*.........
+ // gap // ..................
+ mls v13.8H, v19.8H, v1.H[0] // ...........*......
+ // gap // ..................
+ str q17, [x0, #16] // ...............*..
+ // gap // ..................
+ ldr q3, [x0, #80] // .e................
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ add v15.8H, v7.8H, v13.8H // .............*....
+ // gap // ..................
+ str q10, [x0], #4*16 // ..............*...
+ // gap // ..................
+ sub v13.8H, v7.8H, v13.8H // ............*.....
+ // gap // ..................
+ str q15, [x0, #-32] // ................*.
+ // gap // ..................
+ sqrdmulh v7.8H, v3.8H, v0.H[1] // .....e............
+ // gap // ..................
+ str q13, [x0, #-16] // .................*
+ // gap // ..................
+
+ // original source code
+ // ldr q8, [x0, #0*16] // .......|.*...............
+ // ldr q9, [x0, #1*16] // e......|..........e......
+ // ldr q10, [x0, #2*16] // .......|.....*...........
+ // ldr q11, [x0, #3*16] // .......|*................
+ // mul v12.8h, v9.8h, v0.h[0] // .......*.................
+ // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e.|...............e.
+ // mls v12.8h, v9.8h, v1.h[0] // .......|..*..............
+ // sub v9.8h, v8.8h, v12.8h // .......|......*..........
+ // add v8.8h, v8.8h, v12.8h // .......|.......*.........
+ // mul v12.8h, v11.8h, v0.h[0] // .......|...*.............
+ // sqrdmulh v11.8h, v11.8h, v0.h[1] // .......|....*............
+ // mls v12.8h, v11.8h, v1.h[0] // .......|........*........
+ // sub v11.8h, v10.8h, v12.8h // ...*...|.............*...
+ // add v10.8h, v10.8h, v12.8h // .*.....|...........*.....
+ // str q8, [x0], #4*16 // ..*....|............*....
+ // str q9, [x0, #-3*16] // .......|.........*.......
+ // str q10, [x0, #-2*16] // ....*..|..............*..
+ // str q11, [x0, #-1*16] // ......*|................*
+
+ sub count, count, #1
+ cbnz count, start
+ mul v3.8H, v3.8H, v0.H[0]
+ ldr q19, [x0, #48]
+ ldr q15, [x0, #0]
+ mls v3.8H, v7.8H, v1.H[0]
+ mul v13.8H, v19.8H, v0.H[0]
+ sqrdmulh v19.8H, v19.8H, v0.H[1]
+ ldr q7, [x0, #32]
+ sub v17.8H, v15.8H, v3.8H
+ add v10.8H, v15.8H, v3.8H
+ mls v13.8H, v19.8H, v1.H[0]
+ str q17, [x0, #16]
+ add v15.8H, v7.8H, v13.8H
+ str q10, [x0], #4*16
+ sub v13.8H, v7.8H, v13.8H
+ str q15, [x0, #-32]
+ str q13, [x0, #-16]
diff --git a/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s b/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s
new file mode 100644
index 00000000..3e8c3935
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s
@@ -0,0 +1,117 @@
+qdata0 .req q8
+qdata1 .req q9
+qdata2 .req q10
+qdata3 .req q11
+
+qtwiddle .req q0
+qmodulus .req q1
+
+data0 .req v8
+data1 .req v9
+data2 .req v10
+data3 .req v11
+
+twiddle .req v0
+modulus .req v1
+
+tmp .req v12
+
+data_ptr .req x0
+twiddle_ptr .req x1
+modulus_ptr .req x2
+
+.macro barmul out, in, twiddle, modulus
+ mul \out.8h, \in.8h, \twiddle.h[0]
+ sqrdmulh \in.8h, \in.8h, \twiddle.h[1]
+ mls \out.8h, \in.8h, \modulus.h[0]
+.endm
+
+.macro butterfly data0, data1, tmp, twiddle, modulus
+ barmul \tmp, \data1, \twiddle, \modulus
+ sub \data1.8h, \data0.8h, \tmp.8h
+ add \data0.8h, \data0.8h, \tmp.8h
+.endm
+
+count .req x2
+ldr qtwiddle, [twiddle_ptr, #0]
+ldr qmodulus, [modulus_ptr, #0]
+mov count, #16
+ ldr q7, [x0, #16]
+ ldr q17, [x0, #48]
+ sqrdmulh v25.8H, v7.8H, v0.H[1]
+ mul v3.8H, v7.8H, v0.H[0]
+ sqrdmulh v27.8H, v17.8H, v0.H[1]
+ mls v3.8H, v25.8H, v1.H[0]
+ sub count, count, #1
+start:
+ ldr q15, [x0, #0] // *.................
+ ldr q7, [x0, #80] // .e................
+ mul v13.8H, v17.8H, v0.H[0] // .........*........
+ ldr q19, [x0, #32] // ..*...............
+ ldr q17, [x0, #112] // ...e..............
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ mls v13.8H, v27.8H, v1.H[0] // ...........*......
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ sqrdmulh v25.8H, v7.8H, v0.H[1] // .....e............
+ // gap // ..................
+ add v4.8H, v15.8H, v3.8H // ........*.........
+ sub v10.8H, v15.8H, v3.8H // .......*..........
+ // gap // ..................
+ // gap // ..................
+ mul v3.8H, v7.8H, v0.H[0] // ....e.............
+ // gap // ..................
+ // gap // ..................
+ sub v31.8H, v19.8H, v13.8H // ............*.....
+ // gap // ..................
+ // gap // ..................
+ sqrdmulh v27.8H, v17.8H, v0.H[1] // ..........e.......
+ add v28.8H, v19.8H, v13.8H // .............*....
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ str q10, [x0, #16] // ...............*..
+ mls v3.8H, v25.8H, v1.H[0] // ......e...........
+ str q31, [x0, #48] // .................*
+ str q4, [x0], #4*16 // ..............*...
+ str q28, [x0, #-32] // ................*.
+ // gap // ..................
+ // gap // ..................
+
+ // original source code
+ // ldr q8, [x0, #0*16] // .................*.................
+ // ldr q9, [x0, #1*16] // e................|e................
+ // ldr q10, [x0, #2*16] // ..*..............|..*..............
+ // ldr q11, [x0, #3*16] // ...e.............|...e.............
+ // mul v12.8h, v9.8h, v0.h[0] // ........e........|........e........
+ // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e...........|.....e...........
+ // mls v12.8h, v9.8h, v1.h[0] // .............e...|.............e...
+ // sub v9.8h, v8.8h, v12.8h // .......*.........|.......*.........
+ // add v8.8h, v8.8h, v12.8h // ......*..........|......*..........
+ // mul v12.8h, v11.8h, v0.h[0] // .*...............|.*...............
+ // sqrdmulh v11.8h, v11.8h, v0.h[1] // ..........e......|..........e......
+ // mls v12.8h, v11.8h, v1.h[0] // ....*............|....*............
+ // sub v11.8h, v10.8h, v12.8h // .........*.......|.........*.......
+ // add v10.8h, v10.8h, v12.8h // ...........*.....|...........*.....
+ // str q8, [x0], #4*16 // ...............*.|...............*.
+ // str q9, [x0, #-3*16] // ............*....|............*....
+ // str q10, [x0, #-2*16] // ................*|................*
+ // str q11, [x0, #-1*16] // ..............*..|..............*..
+
+ sub count, count, #1
+ cbnz count, start
+ ldr q15, [x0, #0]
+ mul v13.8H, v17.8H, v0.H[0]
+ ldr q19, [x0, #32]
+ mls v13.8H, v27.8H, v1.H[0]
+ add v4.8H, v15.8H, v3.8H
+ sub v10.8H, v15.8H, v3.8H
+ sub v31.8H, v19.8H, v13.8H
+ add v28.8H, v19.8H, v13.8H
+ str q10, [x0, #16]
+ str q31, [x0, #48]
+ str q4, [x0], #4*16
+ str q28, [x0, #-32]
diff --git a/examples/opt/aarch64/aarch64_simple0_macros_opt_a55.s b/examples/opt/aarch64/aarch64_simple0_macros_opt_a55.s
new file mode 100644
index 00000000..56215f2c
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_simple0_macros_opt_a55.s
@@ -0,0 +1,117 @@
+qdata0 .req q8
+qdata1 .req q9
+qdata2 .req q10
+qdata3 .req q11
+
+qtwiddle .req q0
+qmodulus .req q1
+
+data0 .req v8
+data1 .req v9
+data2 .req v10
+data3 .req v11
+
+twiddle .req v0
+modulus .req v1
+
+tmp .req v12
+
+data_ptr .req x0
+twiddle_ptr .req x1
+modulus_ptr .req x2
+
+.macro barmul out, in, twiddle, modulus
+ mul \out.8h, \in.8h, \twiddle.h[0]
+ sqrdmulh \in.8h, \in.8h, \twiddle.h[1]
+ mls \out.8h, \in.8h, \modulus.h[0]
+.endm
+
+.macro butterfly data0, data1, tmp, twiddle, modulus
+ barmul \tmp, \data1, \twiddle, \modulus
+ sub \data1.8h, \data0.8h, \tmp.8h
+ add \data0.8h, \data0.8h, \tmp.8h
+.endm
+
+count .req x2
+
+ start:
+ ldr q0, [x1, #0] // *...................
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ ldr q31, [x0, #16] // ...*................
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ ldr q6, [x2, #0] // .*..................
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ mul v29.8H, v31.8H, v0.H[0] // ......*.............
+ // gap // ....................
+ sqrdmulh v3.8H, v31.8H, v0.H[1] // .......*............
+ // gap // ....................
+ ldr q14, [x0, #48] // .....*..............
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ ldr q9, [x0, #0] // ..*.................
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ mul v10.8H, v14.8H, v0.H[0] // ...........*........
+ // gap // ....................
+ sqrdmulh v22.8H, v14.8H, v0.H[1] // ............*.......
+ // gap // ....................
+ mls v29.8H, v3.8H, v6.H[0] // ........*...........
+ // gap // ....................
+ ldr q18, [x0, #32] // ....*...............
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ mls v10.8H, v22.8H, v6.H[0] // .............*......
+ // gap // ....................
+ add v21.8H, v9.8H, v29.8H // ..........*.........
+ // gap // ....................
+ sub v29.8H, v9.8H, v29.8H // .........*..........
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ str q21, [x0], #4*16 // ................*...
+ // gap // ....................
+ add v13.8H, v18.8H, v10.8H // ...............*....
+ // gap // ....................
+ str q29, [x0, #-48] // .................*..
+ // gap // ....................
+ sub v3.8H, v18.8H, v10.8H // ..............*.....
+ // gap // ....................
+ str q13, [x0, #-32] // ..................*.
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ str q3, [x0, #-16] // ...................*
+ // gap // ....................
+
+ // original source code
+ // ldr q0, [x1, #0] // *...................
+ // ldr q1, [x2, #0] // ..*.................
+ // ldr q8, [x0, #0*16] // ......*.............
+ // ldr q9, [x0, #1*16] // .*..................
+ // ldr q10, [x0, #2*16] // ..........*.........
+ // ldr q11, [x0, #3*16] // .....*..............
+ // mul v12.8h, v9.8h, v0.h[0] // ...*................
+ // sqrdmulh v9.8h, v9.8h, v0.h[1] // ....*...............
+ // mls v12.8h, v9.8h, v1.h[0] // .........*..........
+ // sub v9.8h, v8.8h, v12.8h // .............*......
+ // add v8.8h, v8.8h, v12.8h // ............*.......
+ // mul v12.8h, v11.8h, v0.h[0] // .......*............
+ // sqrdmulh v11.8h, v11.8h, v0.h[1] // ........*...........
+ // mls v12.8h, v11.8h, v1.h[0] // ...........*........
+ // sub v11.8h, v10.8h, v12.8h // .................*..
+ // add v10.8h, v10.8h, v12.8h // ...............*....
+ // str q8, [x0], #4*16 // ..............*.....
+ // str q9, [x0, #-3*16] // ................*...
+ // str q10, [x0, #-2*16] // ..................*.
+ // str q11, [x0, #-1*16] // ...................*
+
+ end:
diff --git a/examples/opt/aarch64/aarch64_simple0_macros_opt_a72.s b/examples/opt/aarch64/aarch64_simple0_macros_opt_a72.s
new file mode 100644
index 00000000..8cd8d874
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_simple0_macros_opt_a72.s
@@ -0,0 +1,130 @@
+qdata0 .req q8
+qdata1 .req q9
+qdata2 .req q10
+qdata3 .req q11
+
+qtwiddle .req q0
+qmodulus .req q1
+
+data0 .req v8
+data1 .req v9
+data2 .req v10
+data3 .req v11
+
+twiddle .req v0
+modulus .req v1
+
+tmp .req v12
+
+data_ptr .req x0
+twiddle_ptr .req x1
+modulus_ptr .req x2
+
+.macro barmul out, in, twiddle, modulus
+ mul \out.8h, \in.8h, \twiddle.h[0]
+ sqrdmulh \in.8h, \in.8h, \twiddle.h[1]
+ mls \out.8h, \in.8h, \modulus.h[0]
+.endm
+
+.macro butterfly data0, data1, tmp, twiddle, modulus
+ barmul \tmp, \data1, \twiddle, \modulus
+ sub \data1.8h, \data0.8h, \tmp.8h
+ add \data0.8h, \data0.8h, \tmp.8h
+.endm
+
+count .req x2
+
+start:
+ ldr q30, [x0, #48] // .....*..............
+ ldr q9, [x1, #0] // *...................
+// gap // ....................
+ ldr q6, [x0, #0] // ..*.................
+ ldr q18, [x0, #32] // ....*...............
+// gap // ....................
+ ldr q27, [x0, #16] // ...*................
+ ldr q7, [x2, #0] // .*..................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ mul v16.8H, v30.8H, v9.H[0] // ...........*........
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ sqrdmulh v31.8H, v30.8H, v9.H[1] // ............*.......
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ sqrdmulh v15.8H, v27.8H, v9.H[1] // .......*............
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ mul v27.8H, v27.8H, v9.H[0] // ......*.............
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ mls v16.8H, v31.8H, v7.H[0] // .............*......
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ mls v27.8H, v15.8H, v7.H[0] // ........*...........
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ add v30.8H, v18.8H, v16.8H // ...............*....
+ sub v24.8H, v18.8H, v16.8H // ..............*.....
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ sub v7.8H, v6.8H, v27.8H // .........*..........
+ add v27.8H, v6.8H, v27.8H // ..........*.........
+// gap // ....................
+ str q24, [x0, #48] // ...................*
+ str q30, [x0, #32] // ..................*.
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ str q7, [x0, #16] // .................*..
+ str q27, [x0], #4*16 // ................*...
+// gap // ....................
+
+// original source code
+// ldr q0, [x1, #0] // .*..................
+// ldr q1, [x2, #0] // .....*..............
+// ldr q8, [x0, #0*16] // ..*.................
+// ldr q9, [x0, #1*16] // ....*...............
+// ldr q10, [x0, #2*16] // ...*................
+// ldr q11, [x0, #3*16] // *...................
+// mul v12.8h, v9.8h, v0.h[0] // .........*..........
+// sqrdmulh v9.8h, v9.8h, v0.h[1] // ........*...........
+// mls v12.8h, v9.8h, v1.h[0] // ...........*........
+// sub v9.8h, v8.8h, v12.8h // ..............*.....
+// add v8.8h, v8.8h, v12.8h // ...............*....
+// mul v12.8h, v11.8h, v0.h[0] // ......*.............
+// sqrdmulh v11.8h, v11.8h, v0.h[1] // .......*............
+// mls v12.8h, v11.8h, v1.h[0] // ..........*.........
+// sub v11.8h, v10.8h, v12.8h // .............*......
+// add v10.8h, v10.8h, v12.8h // ............*.......
+// str q8, [x0], #4*16 // ...................*
+// str q9, [x0, #-3*16] // ..................*.
+// str q10, [x0, #-2*16] // .................*..
+// str q11, [x0, #-1*16] // ................*...
+
+end:
diff --git a/examples/opt/aarch64/aarch64_simple0_opt_a55.s b/examples/opt/aarch64/aarch64_simple0_opt_a55.s
new file mode 100644
index 00000000..66feac88
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_simple0_opt_a55.s
@@ -0,0 +1,78 @@
+ ldr q2, [x1, #0] // *...................
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ ldr q0, [x0, #48] // .....*..............
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ ldr q31, [x0, #16] // ...*................
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ mul v7.8H, v0.8H, v2.H[0] // ...........*........
+ // gap // ....................
+ ldr q12, [x2, #0] // .*..................
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ sqrdmulh v23.8H, v31.8H, v2.H[1] // .......*............
+ // gap // ....................
+ mul v22.8H, v31.8H, v2.H[0] // ......*.............
+ // gap // ....................
+ sqrdmulh v2.8H, v0.8H, v2.H[1] // ............*.......
+ // gap // ....................
+ ldr q28, [x0] // ..*.................
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ mls v22.8H, v23.8H, v12.H[0] // ........*...........
+ // gap // ....................
+ ldr q23, [x0, #32] // ....*...............
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ mls v7.8H, v2.8H, v12.H[0] // .............*......
+ // gap // ....................
+ add v12.8H, v28.8H, v22.8H // ..........*.........
+ // gap // ....................
+ sub v2.8H, v28.8H, v22.8H // .........*..........
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ str q12, [x0], #4*16 // ................*...
+ // gap // ....................
+ add v8.8H, v23.8H, v7.8H // ...............*....
+ // gap // ....................
+ str q2, [x0, #-48] // .................*..
+ // gap // ....................
+ sub v22.8H, v23.8H, v7.8H // ..............*.....
+ // gap // ....................
+ str q8, [x0, #-32] // ..................*.
+ // gap // ....................
+ // gap // ....................
+ // gap // ....................
+ str q22, [x0, #-16] // ...................*
+ // gap // ....................
+
+ // original source code
+ // ldr q0, [x1, #0] // *...................
+ // ldr q1, [x2, #0] // ....*...............
+ // ldr q8, [x0] // ........*...........
+ // ldr q9, [x0, #1*16] // ..*.................
+ // ldr q10, [x0, #2*16] // ..........*.........
+ // ldr q11, [x0, #3*16] // .*..................
+ // mul v24.8h, v9.8h, v0.h[0] // ......*.............
+ // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....*..............
+ // mls v24.8h, v9.8h, v1.h[0] // .........*..........
+ // sub v9.8h, v8.8h, v24.8h // .............*......
+ // add v8.8h, v8.8h, v24.8h // ............*.......
+ // mul v24.8h, v11.8h, v0.h[0] // ...*................
+ // sqrdmulh v11.8h, v11.8h, v0.h[1] // .......*............
+ // mls v24.8h, v11.8h, v1.h[0] // ...........*........
+ // sub v11.8h, v10.8h, v24.8h // .................*..
+ // add v10.8h, v10.8h, v24.8h // ...............*....
+ // str q8, [x0], #4*16 // ..............*.....
+ // str q9, [x0, #-3*16] // ................*...
+ // str q10, [x0, #-2*16] // ..................*.
+ // str q11, [x0, #-1*16] // ...................*
diff --git a/examples/opt/aarch64/aarch64_simple0_opt_a72.s b/examples/opt/aarch64/aarch64_simple0_opt_a72.s
new file mode 100644
index 00000000..f1a6f5cf
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_simple0_opt_a72.s
@@ -0,0 +1,91 @@
+ ldr q0, [x1, #0] // *...................
+ ldr q7, [x0, #16] // ...*................
+// gap // ....................
+ ldr q10, [x0, #48] // .....*..............
+// gap // ....................
+// gap // ....................
+ ldr q13, [x2, #0] // .*..................
+// gap // ....................
+// gap // ....................
+ ldr q3, [x0, #32] // ....*...............
+// gap // ....................
+// gap // ....................
+ sqrdmulh v2.8H, v7.8H, v0.H[1] // .......*............
+ ldr q9, [x0] // ..*.................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ mul v26.8H, v7.8H, v0.H[0] // ......*.............
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ sqrdmulh v14.8H, v10.8H, v0.H[1] // ............*.......
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ mls v26.8H, v2.8H, v13.H[0] // ........*...........
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ mul v11.8H, v10.8H, v0.H[0] // ...........*........
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ mls v11.8H, v14.8H, v13.H[0] // .............*......
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ add v22.8H, v9.8H, v26.8H // ..........*.........
+ sub v9.8H, v9.8H, v26.8H // .........*..........
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ str q9, [x0, #16] // .................*..
+ sub v31.8H, v3.8H, v11.8H // ..............*.....
+ add v9.8H, v3.8H, v11.8H // ...............*....
+ str q22, [x0], #4*16 // ................*...
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ str q31, [x0, #-16] // ...................*
+ str q9, [x0, #-32] // ..................*.
+// gap // ....................
+
+// original source code
+// ldr q0, [x1, #0] // *...................
+// ldr q1, [x2, #0] // ...*................
+// ldr q8, [x0] // ......*.............
+// ldr q9, [x0, #1*16] // .*..................
+// ldr q10, [x0, #2*16] // ....*...............
+// ldr q11, [x0, #3*16] // ..*.................
+// mul v24.8h, v9.8h, v0.h[0] // .......*............
+// sqrdmulh v9.8h, v9.8h, v0.h[1] // .....*..............
+// mls v24.8h, v9.8h, v1.h[0] // .........*..........
+// sub v9.8h, v8.8h, v24.8h // .............*......
+// add v8.8h, v8.8h, v24.8h // ............*.......
+// mul v24.8h, v11.8h, v0.h[0] // ..........*.........
+// sqrdmulh v11.8h, v11.8h, v0.h[1] // ........*...........
+// mls v24.8h, v11.8h, v1.h[0] // ...........*........
+// sub v11.8h, v10.8h, v24.8h // ...............*....
+// add v10.8h, v10.8h, v24.8h // ................*...
+// str q8, [x0], #4*16 // .................*..
+// str q9, [x0, #-3*16] // ..............*.....
+// str q10, [x0, #-2*16] // ...................*
+// str q11, [x0, #-1*16] // ..................*.
diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py
index d43a39c2..596ec311 100644
--- a/slothy/core/slothy.py
+++ b/slothy/core/slothy.py
@@ -232,9 +232,6 @@ def optimize(self, start=None, end=None, loop_synthesis_cb=None, logname=None):
# Check if the body has a dominant indentation
indentation = AsmHelper.find_indentation(body)
- if self.config.with_llvm_mca_before is True:
- orig_stats = self._make_llvm_mca_stats(pre, body, "ORIGINAL", indentation)
-
if c.with_preprocessor:
self.logger.info("Apply C preprocessor...")
body = CPreprocessor.unfold(pre, body, c.compiler_binary)
@@ -246,6 +243,10 @@ def optimize(self, start=None, end=None, loop_synthesis_cb=None, logname=None):
body = AsmAllocation.unfold_all_aliases(c.register_aliases, body)
body = SourceLine.apply_indentation(body, indentation)
self.logger.info("Instructions in body: %d", len(list(filter(None, body))))
+
+ if self.config.with_llvm_mca_before is True:
+ orig_stats = self._make_llvm_mca_stats(pre, body, "ORIGINAL", indentation)
+
early, core, late, num_exceptional = Heuristics.periodic(body, logger, c)
if self.config.with_llvm_mca_before is True:
@@ -265,6 +266,7 @@ def indented(code):
if end is not None:
core += [SourceLine(f"{end}:")]
+ core = SourceLine.apply_indentation(core, self.config.indentation)
if not self.config.sw_pipelining.enabled:
assert early == []
assert late == []
@@ -395,9 +397,6 @@ def optimize_loop(self, loop_lbl, postamble_label=None):
c = self.config.copy()
c.add_aliases(aliases)
- if self.config.with_llvm_mca_before is True:
- orig_stats = self._make_llvm_mca_stats(early, body, "ORIGINAL", indentation)
-
if c.with_preprocessor:
self.logger.info("Apply C preprocessor...")
body = CPreprocessor.unfold(early, body, c.compiler_binary)
@@ -408,10 +407,12 @@ def optimize_loop(self, loop_lbl, postamble_label=None):
body = AsmMacro.unfold_all_macros(early, body, inherit_comments=c.inherit_macro_comments)
body = AsmAllocation.unfold_all_aliases(c.register_aliases, body)
body = SourceLine.apply_indentation(body, indentation)
-
self.logger.info("Optimizing loop %s (%d instructions) ...",
loop_lbl, len(body))
+ if self.config.with_llvm_mca_before is True:
+ orig_stats = self._make_llvm_mca_stats(early, body, "ORIGINAL", indentation)
+
preamble_code, kernel_code, postamble_code, num_exceptional = \
Heuristics.periodic(body, logger, c)
@@ -419,16 +420,19 @@ def optimize_loop(self, loop_lbl, postamble_label=None):
kernel_code = kernel_code + orig_stats
if self.config.with_llvm_mca_after is True:
+ print(SourceLine.write_multiline(kernel_code))
new_stats_kernel = self._make_llvm_mca_stats(early, kernel_code, "OPTIMIZED",
indentation)
kernel_code = kernel_code + new_stats_kernel
- if len(preamble_code) > 0:
+ if self.config.sw_pipelining.optimize_preamble is True \
+ and len(preamble_code) > 0:
new_stats_preamble = self._make_llvm_mca_stats(early, preamble_code, "PREAMBLE",
indentation)
preamble_code = preamble_code + new_stats_preamble
- if len(postamble_code) > 0:
+ if self.config.sw_pipelining.optimize_postamble is True \
+ and len(postamble_code) > 0:
new_stats_postamble = self._make_llvm_mca_stats(early, postamble_code, "POSTAMBLE",
indentation)
postamble_code = postamble_code + new_stats_postamble
diff --git a/tutorial/README.md b/tutorial/README.md
new file mode 100644
index 00000000..a49bb728
--- /dev/null
+++ b/tutorial/README.md
@@ -0,0 +1,1050 @@
+# SLOTHY Tutorial
+
+This tutorial introduces you to using the SLOTHY superoptimizer for optimizing assembly programs for a specific microarchitecture.
+It goes beyond what is written in the [README](../README.md) or the [SLOTHY
+paper](https://eprint.iacr.org/2022/1303.pdf) in that it gives more examples on how we, the developers of SLOTHY,
+typically use SLOTHY to optimize cryptographic code. At the end of the tutorial, you should be familiar with
+the workflow of using SLOTHY as well as a number of common ways to debug or improve your results.
+
+## Introduction to SLOTHY
+
+SLOTHY is a fixed instruction superoptimizer: Its input is assembly and its output is semantically-equivalent optimized
+assembly using the same instructions and data flow. The fact that SLOTHY does not change instructions is very important
+both theoretically (in terms of complexity of optimization) and practically (in terms of developer control) and sets SLOTHY apart from
+_synthesizing_ superoptimizers like [souper](https://github.com/google/souper).
+
+Concretely, SLOTHY performs three main jobs:
+1. (Re-)schedule instructions to hide latencies and improve utilization of all execution units.
+2. Rename registers in case this enables a better scheduling.
+3. Perform software pipelining (aka periodic loop interleaving). We will cover software pipelining in more depth later in this tutorial.
+
+SLOTHY performs these jobs by first lifting the input assembly into a data-flow graph (DFG) modelling dependencies
+between instructions. At this level, the ordering of instructions and the choice of register names is no longer visible.
+The goal of SLOTHY, then, is to find a traversal/lowering of the DFG that results in the least
+number of pipeline stalls. A traversal/lowering of the graph is assigning to each instruction an index at which
+the instruction will be in the output, plus a choice of registers to be used for its outputs. SLOTHY does so by turning
+the graph together with information about the (micro)architecture into constraints that are fed into an external constraint
+solver; so far, we have been using Google OR-tools, but in principle one can use other solvers as well.
+Constraints come in two flavours: Architectural and microarchitectural. Architectural constraints simply ensure that the
+resulting code is architecturally valid (e.g. SLOTHY does not use a vector register in a scalar instruction) and
+functionally correct (it has the same DFG). Microarchitectural constraints imply (hopefully) that the code will run fast
+on the target; SLOTHY models microarchitectures in terms of issue width, instruction latencies, throughput, forwarding
+paths, and the number of execution units able to execute certain instructions. We refer to the [SLOTHY
+paper](https://eprint.iacr.org/2022/1303.pdf) for details of the constraint model, which are not relevant here.
+
+Note again that SLOTHY does (largely) not change instructions: Instruction selection is left to the developer.
+For cryptographic code -- which is what SLOTHY was developed for -- instruction selection is a core focus of research
+and highly-optimized instruction sequences implementing a cryptographic (sub-)routine usually exist. Tight control over
+the choice of instructions is also important from a security perspective, as variable-time instructions have to be
+avoided.
+
+**High-assurance cryptography**: While formal verification is not part of SLOTHY itself, there is potential for
+combining existing formal verification tools with SLOTHY. From a high level, formal verification should be relatively
+simple, owing to the fact that SLOTHY does not change the DFG: In fact, SLOTHY itself includes a selfcheck that
+lifts the output assembly back into a DFG and confirms that it is isomorphic to the input DFG via the permutation found by
+SLOTHY. However, while this is a strong indicator of correctness of the output assembly, it does _not_ amount to a
+formal verification, as pitfalls do remain (notably bad user configurations and subtleties around modelling
+memory and load/store offsets which we will not discuss in this tutorial). Research into combining SLOTHY with trusted
+verification infrastructure is therefore needed. As a first promising example, AWS-LC has recently
+[integrated](https://github.com/aws/aws-lc/pull/1478) an implementation of X25519 that was auto-generated by SLOTHY and
+formally verified using the [HOL-Light](https://github.com/jrh13/hol-light) proof assistant.
+
+## Table of contents
+
+1) [Installation](#1-installation). This is limited to the fastest way of installing SLOTHY using pip. For more complete instructions, see the [README](../README.md).
+2) [Getting started](#2-getting-started)
+3) [Using SLOTHY for your own code](#3-writing-your-own-calling-code)
+4) [Using SLOTHY's Software Pipelining](#4-software-pipelining)
+5) [Checking the quality of SLOTHY optimizations](#5-checking-the-quality-of-slothy-optimizations)
+6) [Optimizing a full Neon NTT](#6-optimizing-a-full-neon-ntt)
+7) [Optimizing larger pieces of code](#7-optimizing-larger-pieces-of-code)
+8) [Adding a new microarchitecture](#8-adding-a-new-microarchitecture)
+
+The SLOTHY calling code used for the parts 3-7 is located in `tutorial-{3a,3b,4,5,6,7}.py`.
+
+## 1. Installation
+
+SLOTHY requires python3 (>= 3.10).
+The easiest way to install the dependencies of SLOTHY is using pip.
+It's advised to make use of a [virtual environment](https://docs.python.org/3/library/venv.html).
+
+The following steps should get you started:
+
+```bash
+git clone https://github.com/slothy-optimizer/slothy
+cd slothy
+# setup venv
+python3 -m venv venv
+source venv/bin/activate
+# install dependencies
+pip install -r requirements.txt
+```
+
+You can try to run SLOTHY on one of the examples that come with SLOTHY to make sure it runs without errors:
+```
+python3 example.py --examples simple0
+```
+
+We will look into more examples shortly and discuss input, output, and available flags.
+
+## 2. Getting Started
+
+The simplest way to get started using SLOTHY is by trying out some of the examples that come with SLOTHY.
+Once you work on your own code, you will likely be invoking SLOTHY via the `slothy-cli` command or by calling the SLOTHY module from your own Python script, allowing you to control all the different options SLOTHY has.
+However, for now we will be using the [example.py](../example.py) script, which contains a number of examples including the ones we have optimized in the SLOTHY paper.
+You can run `python3 example.py --help` to see all examples available.
+
+Let's look at a very simple example from the previous section called `aarch64_simple0`.
+You can find the corresponding code in [examples/naive/aarch64/aarch64_simple0.s](../examples/naive/aarch64/aarch64_simple0.s):
+```nasm
+ldr q0, [x1, #0]
+ldr q1, [x2, #0]
+
+ldr q8, [x0]
+ldr q9, [x0, #1*16]
+ldr q10, [x0, #2*16]
+ldr q11, [x0, #3*16]
+
+mul v24.8h, v9.8h, v0.h[0]
+sqrdmulh v9.8h, v9.8h, v0.h[1]
+mls v24.8h, v9.8h, v1.h[0]
+sub v9.8h, v8.8h, v24.8h
+add v8.8h, v8.8h, v24.8h
+
+mul v24.8h, v11.8h, v0.h[0]
+sqrdmulh v11.8h, v11.8h, v0.h[1]
+mls v24.8h, v11.8h, v1.h[0]
+sub v11.8h, v10.8h, v24.8h
+add v10.8h, v10.8h, v24.8h
+
+str q8, [x0], #4*16
+str q9, [x0, #-3*16]
+str q10, [x0, #-2*16]
+str q11, [x0, #-1*16]
+```
+
+It contains a straight-line piece of assembly for the Armv8-A architecture. This architecture implements the Neon vector instruction extension and all the instructions in this example are Neon vector instructions.
+If you have never written Neon assembly before, you do not have to worry about it at this point.
+All you need to know about the code is that it loads some vectors from memory, performs some arithmetic operations, and writes back the result to memory.
+Note that there are two independent streams of computation on the four vectors loaded from memory, and, hence, there are many possibilities to re-order this code without affecting its semantics.
+This code is able to run on a variety of different microarchitectures, ranging from low-end energy efficient in-order cores like the Arm Cortex-A55 to high-end out-of-order CPUs with very complex pipelines like the Apple M1 or Arm Neoverse server CPUs.
+For the in-order cores, the instruction scheduling plays the most essential role as poorly scheduled code is very likely to have poor performance, and hence, we will focus on the Cortex-A55 architecture in the following.
+Note, however, that SLOTHY has been used to also obtain significant speed-ups for out-of-order cores.
+
+SLOTHY comes with models for various Arm architectures, including the power-efficient, in-order
+[Cortex-A55](https://developer.arm.com/Processors/Cortex-A55), so we can now optimize this piece of code for that
+microarchitecture. [example.py](../example.py) contains the needed SLOTHY invocations for convenience, so we can simply run `python3
+example.py --examples aarch64_simple0_a55` which will optimize for the Cortex-A55 microarchitecture. You can check
+[example.py](../example.py) for the details. This will optimize the piece of code above and write the output code to
+[examples/opt/aarch64/aarch64_simple0_opt_a55.s](../examples/opt/aarch64/aarch64_simple0_opt_a55.s).
+SLOTHY should print something similar to this:
+```
+INFO:aarch64_simple0_a55:Instructions in body: 20
+INFO:aarch64_simple0_a55.slothy:Perform internal binary search for minimal number of stalls...
+INFO:aarch64_simple0_a55.slothy:Attempt optimization with max 32 stalls...
+INFO:aarch64_simple0_a55.slothy:Objective: minimize number of stalls
+INFO:aarch64_simple0_a55.slothy:Invoking external constraint solver (OR-Tools CP-SAT v9.7.2996) ...
+INFO:aarch64_simple0_a55.slothy:[0.0653s]: Found 1 solutions so far... objective 19.0, bound 12.0 (minimize number of stalls)
+INFO:aarch64_simple0_a55.slothy:[0.0801s]: Found 2 solutions so far... objective 18.0, bound 12.0 (minimize number of stalls)
+INFO:aarch64_simple0_a55.slothy:OPTIMAL, wall time: 0.180540 s
+INFO:aarch64_simple0_a55.slothy:Booleans in result: 449
+INFO:aarch64_simple0_a55.slothy.selfcheck:OK!
+INFO:aarch64_simple0_a55.slothy:Minimum number of stalls: 18
+```
+
+You can follow the steps SLOTHY performs and see the calls to the constraint solver trying to find a re-scheduling of this code containing at most 32 stalls (a default starting point we have set here to speed up the example).
+At the same time it is trying to minimize the number of stalls. This is passed as an objective to the constraint solver (OR-tools) which tries to find a solution with the minimum number of stalls.
+The best solution it can find has 18 stalls -- which is guaranteed to be the minimum number of stalls given this piece of code and the model of the microarchitecture in SLOTHY.
+In the last step, SLOTHY will transform the found traversal of the DFG into actual assembly and write it to the file.
+To make sure everything worked out as expected, it will perform a selfcheck which consists of transforming the output assembly into a DFG again and testing that the resulting graph is isomorphic to the input DFG.
+
+We can now take a look at the output assembly in [examples/opt/aarch64/aarch64_simple0_opt_a55.s](../examples/opt/aarch64/aarch64_simple0_opt_a55.s):
+```nasm
+ldr q8, [x1, #0] // *...................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ldr q30, [x0, #16] // ...*................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ldr q25, [x0] // ..*.................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+mul v13.8H, v30.8H, v8.H[0] // ......*.............
+// gap // ....................
+sqrdmulh v21.8H, v30.8H, v8.H[1] // .......*............
+// gap // ....................
+ldr q30, [x0, #48] // .....*..............
+// gap // ....................
+// gap // ....................
+// gap // ....................
+ldr q3, [x2, #0] // .*..................
+// gap // ....................
+// gap // ....................
+// gap // ....................
+sqrdmulh v5.8H, v30.8H, v8.H[1] // ............*.......
+// gap // ....................
+mul v30.8H, v30.8H, v8.H[0] // ...........*........
+// gap // ....................
+mls v13.8H, v21.8H, v3.H[0] // ........*...........
+// gap // ....................
+ldr q15, [x0, #32] // ....*...............
+// gap // ....................
+// gap // ....................
+// gap // ....................
+mls v30.8H, v5.8H, v3.H[0] // .............*......
+// gap // ....................
+add v8.8H, v25.8H, v13.8H // ..........*.........
+// gap // ....................
+sub v20.8H, v25.8H, v13.8H // .........*..........
+// gap // ....................
+// gap // ....................
+// gap // ....................
+str q8, [x0], #4*16 // ................*...
+// gap // ....................
+add v26.8H, v15.8H, v30.8H // ...............*....
+// gap // ....................
+str q20, [x0, #-48] // .................*..
+// gap // ....................
+sub v5.8H, v15.8H, v30.8H // ..............*.....
+// gap // ....................
+str q26, [x0, #-32] // ..................*.
+// gap // ....................
+// gap // ....................
+// gap // ....................
+str q5, [x0, #-16] // ...................*
+// gap // ....................
+
+// original source code
+// ldr q0, [x1, #0] // *...................
+// ldr q1, [x2, #0] // ......*.............
+// ldr q8, [x0] // ..*.................
+// ldr q9, [x0, #1*16] // .*..................
+// ldr q10, [x0, #2*16] // ..........*.........
+// ldr q11, [x0, #3*16] // .....*..............
+// mul v24.8h, v9.8h, v0.h[0] // ...*................
+// sqrdmulh v9.8h, v9.8h, v0.h[1] // ....*...............
+// mls v24.8h, v9.8h, v1.h[0] // .........*..........
+// sub v9.8h, v8.8h, v24.8h // .............*......
+// add v8.8h, v8.8h, v24.8h // ............*.......
+// mul v24.8h, v11.8h, v0.h[0] // ........*...........
+// sqrdmulh v11.8h, v11.8h, v0.h[1] // .......*............
+// mls v24.8h, v11.8h, v1.h[0] // ...........*........
+// sub v11.8h, v10.8h, v24.8h // .................*..
+// add v10.8h, v10.8h, v24.8h // ...............*....
+// str q8, [x0], #4*16 // ..............*.....
+// str q9, [x0, #-3*16] // ................*...
+// str q10, [x0, #-2*16] // ..................*.
+// str q11, [x0, #-1*16] // ...................*
+```
+
+At the top you can see the re-scheduled assembly and at the bottom you find the original source code as a comment.
+As comments next to the two sections, you can also see a visual representation on how these instructions have been rescheduled.
+You can see that various instructions have been moved around to achieve fewer stalls.
+
+Note that if you do run SLOTHY again, it may produce a different scheduling with the same minimal number of stalls.
+This is expected and due to the constraint solver not producing deterministic outputs.
+
+In the scheduled code, you can see `// gap` where SLOTHY would expect a "gap" in the current model:
+This is not a pipeline stall in the sense of a wasted cycle, but rather an issue slot of
+the CPU that was not used. The Cortex-A55 is a dual-issue CPU meaning in ideal circumstances 2 instructions can be issued per cycle.
+However, the Neon pipeline can only issue a single (128-bit/q-form) Neon instruction per cycle.
+Since our code only consists of (128-bit/q-form) Neon instructions, the best we can hope for is a single `gap` after each instruction.
+To make use of these issue slots one would have to mix in scalar instructions (or use 64-bit (d-form) Neon instructions).
+
+Also note the registers used: In the original code `v24` was used as a temporary register in both computation streams, preventing them from being effectively interleaved.
+SLOTHY renamed those registers to be able to interleave both computations. Other registers have also been arbitrarily
+renamed, but without any specific reason.
+
+## 3. Writing your own calling code
+
+When writing your own calls to SLOTHY, there are generally two options:
+(1) Using SLOTHY as a Python module, or (2) using `slothy-cli` using command line options. We will continue with (1) to demonstrate some features.
+To reproduce the example above, you can place the following code into your own Python script in the root directory of SLOTHY:
+
+```python
+import logging
+import sys
+
+from slothy import Slothy
+
+import slothy.targets.aarch64.aarch64_neon as AArch64_Neon
+import slothy.targets.aarch64.cortex_a55 as Target_CortexA55
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+arch = AArch64_Neon
+target = Target_CortexA55
+
+slothy = Slothy(arch, target)
+
+# example
+slothy.load_source_from_file("examples/naive/aarch64/aarch64_simple0.s")
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+
+slothy.optimize()
+slothy.write_source_to_file("opt/aarch64_simple0_a55.s")
+```
+
+You will need to pass to SLOTHY both the architecture model (containing the instruction mnemonics and which registers
+are input and outputs for each instruction) and the microarchitectual model (containing latencies, throughputs,
+execution units, etc.). In this case, we use the AArch64+Neon architecture model and the Arm Cortex-A55
+microarchitecture model that come with SLOTHY.
+
+The calls to SLOTHY should be self-explanatory:
+ - `load_source_from_file` loads an assembly file to be optimized.
+ - `slothy.config` can be used to configure SLOTHY. For the documentation of the configuration options, see the comments in [config.py](../slothy/core/config.py).
+ - `optimize` performs the actual optimizations by calling the external constraint solver.
+ - `write_source_to_file` writes back the optimized assembly to a file.
+
+Setting `slothy.config.variable_size` results in the number of stalls being a parameter of the model that the constraint
+solver is trying to minimize within a static 'stall budget'. By default, SLOTHY would start with a stall budget of 0 and
+exponentially increase until a solution is found. To speed this process up, we set `stalls_first_attempt=32`, starting
+the search with a sufficient stall budget of 32 cycles.
+
+The `variable_size` may not perform well for large examples. The default strategy (`variable_size=False`) is, hence, to
+pass a fixed number of allowed stalls to the constraint solver and to have SLOTHY perform an 'external' binary search to
+find the minimum number of stalls for which a solution exists.
+
+Even with this small Neon example, you can see that understanding the input code is much easier than the output code. In
+fact, the input code can be further clarified through the use of macros and register aliases, leading to the following
+'clean' version from
+[examples/naive/aarch64/aarch64_simple0_macros.s](../examples/naive/aarch64/aarch64_simple0_macros.s) which makes it
+apparent that our example is just a pair of NTT butterflies using Barrett multiplication. Note that the `.req` and
+`.macro` directives used here are commonly supported [assembly
+directives](https://www.sourceware.org/binutils/docs/as/ARM-Directives.html).
+
+```nasm
+qdata0 .req q8
+qdata1 .req q9
+qdata2 .req q10
+qdata3 .req q11
+
+qtwiddle .req q0
+
+data0 .req v8
+data1 .req v9
+data2 .req v10
+data3 .req v11
+
+twiddle .req v0
+modulus .req v1
+
+tmp .req v12
+
+data_ptr .req x0
+twiddle_ptr .req x1
+
+.macro barmul out, in, twiddle, modulus
+ mul \out.8h, \in.8h, \twiddle.h[0]
+ sqrdmulh \in.8h, \in.8h, \twiddle.h[1]
+ mls \out.8h, \in.8h, \modulus.h[0]
+.endm
+
+.macro butterfly data0, data1, tmp, twiddle, modulus
+ barmul \tmp, \data1, \twiddle, \modulus
+ sub \data1.8h, \data0.8h, \tmp.8h
+ add \data0.8h, \data0.8h, \tmp.8h
+.endm
+
+start:
+
+ ldr qtwiddle, [twiddle_ptr, #0]
+
+ ldr qdata0, [data_ptr, #0*16]
+ ldr qdata1, [data_ptr, #1*16]
+ ldr qdata2, [data_ptr, #2*16]
+ ldr qdata3, [data_ptr, #3*16]
+
+ butterfly data0, data1, tmp, twiddle, modulus
+ butterfly data2, data3, tmp, twiddle, modulus
+
+ str qdata0, [data_ptr], #4*16
+ str qdata1, [data_ptr, #-3*16]
+ str qdata2, [data_ptr, #-2*16]
+ str qdata3, [data_ptr, #-1*16]
+
+end:
+```
+
+SLOTHY will then internally expand all macros and the resulting DFG will be exactly the same as before.
+To make this work, we have to slightly change the SLOTHY code:
+```python
+# example
+slothy.load_source_from_file("../examples/naive/aarch64/aarch64_simple0_macros.s")
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+
+slothy.optimize(start="start", end="end")
+slothy.write_source_to_file("opt/aarch64_simple0_macros_a55.s")
+```
+
+The difference is that, we have to explicitly pass `start` and `end` labels to SLOTHY.
+This is because SLOTHY does not understand the code before that and the parsing would fail if run on that part of the code.
+
+We have found it very useful to base assembly optimization on a 'clean' version as above and automate its optimization
+using SLOTHY, which is the reason why we believe that SLOTHY can help with the development of auditable and maintainable
+high-performance assembly.
+
+
+## 4. Software Pipelining
+
+One of the most powerful features of SLOTHY is [software
+pipelining](https://en.wikipedia.org/wiki/Software_pipelining). The core idea of software pipelining is that loop
+scheduling can be improved by moving some instructions to earlier or later iterations of the loop, that is, by
+interleaving loop iterations. Note that this does not mean that the loop has to be unrolled: By maintaining
+the periodicity of the interleaved code, it is possible to keep it within a loop, thereby retaining code compactness.
+Only the first and last iteration(s) may require to be treated separately; those are called the preamble and
+postamble, respectively.
+
+Let's look at an example demonstrating how SLOTHY can perform software pipelining for you.
+Consider the simple case of performing the code from the previous example within a loop with a fixed number of iterations (>=2). This is exactly what the
+`aarch64_simple0_loop` example in SLOTHY does:
+```nasm
+... // .req and .macro as above
+
+count .req x2
+ldr qtwiddle, [twiddle_ptr, #0]
+ldr qmodulus, [modulus_ptr, #0]
+
+mov count, #16
+start:
+
+ ldr qtwiddle, [twiddle_ptr, #0]
+
+ ldr qdata0, [data_ptr, #0*16]
+ ldr qdata1, [data_ptr, #1*16]
+ ldr qdata2, [data_ptr, #2*16]
+ ldr qdata3, [data_ptr, #3*16]
+
+ butterfly data0, data1, tmp, twiddle, modulus
+ butterfly data2, data3, tmp, twiddle, modulus
+
+ str qdata0, [data_ptr], #4*16
+ str qdata1, [data_ptr, #-3*16]
+ str qdata2, [data_ptr, #-2*16]
+ str qdata3, [data_ptr, #-1*16]
+
+ subs count, count, #1
+ cbnz count, start
+```
+
+Let's use SLOTHY to superoptimize this loop:
+```python
+slothy.load_source_from_file("examples/naive/aarch64/aarch64_simple0_loop.s")
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+
+slothy.config.sw_pipelining.enabled = True
+slothy.config.sw_pipelining.optimize_preamble = False
+slothy.config.sw_pipelining.optimize_postamble = False
+slothy.optimize_loop("start")
+slothy.write_source_to_file("opt/aarch64_simple0_loop_a55.s")
+```
+
+Software pipelining needs to be enabled by setting `slothy.config.sw_pipelining.enabled = True`.
+We also need to specifically tell SLOTHY that we would like to optimize the loop starting at `start` -- SLOTHY will
+automatically detect that the loop ends at `cbnz count, start`. Finally, `optimize_preamble = False` and
+`optimize_postamble = False` prevent SLOTHY from optimizing the loop preamble and postamble (first/last iteration), which
+it would by default -- you normally want this set, but we unset it here to simplify the output. This is what it will
+look like:
+
+```nasm
+// ...
+count .req x2
+ldr qtwiddle, [twiddle_ptr, #0]
+ldr qmodulus, [modulus_ptr, #0]
+mov count, #16
+ ldr q3, [x0, #16]
+ sqrdmulh v7.8H, v3.8H, v0.H[1]
+ sub count, count, #1
+start:
+ mul v3.8H, v3.8H, v0.H[0] // ....*.............
+ // gap // ..................
+ ldr q19, [x0, #48] // ...*..............
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ ldr q15, [x0, #0] // *.................
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ mls v3.8H, v7.8H, v1.H[0] // ......*...........
+ // gap // ..................
+ mul v13.8H, v19.8H, v0.H[0] // .........*........
+ // gap // ..................
+ sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........*.......
+ // gap // ..................
+ ldr q7, [x0, #32] // ..*...............
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ sub v17.8H, v15.8H, v3.8H // .......*..........
+ // gap // ..................
+ add v10.8H, v15.8H, v3.8H // ........*.........
+ // gap // ..................
+ mls v13.8H, v19.8H, v1.H[0] // ...........*......
+ // gap // ..................
+ str q17, [x0, #16] // ...............*..
+ // gap // ..................
+ ldr q3, [x0, #80] // .e................
+ // gap // ..................
+ // gap // ..................
+ // gap // ..................
+ add v15.8H, v7.8H, v13.8H // .............*....
+ // gap // ..................
+ str q10, [x0], #4*16 // ..............*...
+ // gap // ..................
+ sub v13.8H, v7.8H, v13.8H // ............*.....
+ // gap // ..................
+ str q15, [x0, #-32] // ................*.
+ // gap // ..................
+ sqrdmulh v7.8H, v3.8H, v0.H[1] // .....e............
+ // gap // ..................
+ str q13, [x0, #-16] // .................*
+ // gap // ..................
+
+ // original source code
+ // ldr q8, [x0, #0*16] // .......|.*...............
+ // ldr q9, [x0, #1*16] // e......|..........e......
+ // ldr q10, [x0, #2*16] // .......|.....*...........
+ // ldr q11, [x0, #3*16] // .......|*................
+ // mul v12.8h, v9.8h, v0.h[0] // .......*.................
+ // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e.|...............e.
+ // mls v12.8h, v9.8h, v1.h[0] // .......|..*..............
+ // sub v9.8h, v8.8h, v12.8h // .......|......*..........
+ // add v8.8h, v8.8h, v12.8h // .......|.......*.........
+ // mul v12.8h, v11.8h, v0.h[0] // .......|...*.............
+ // sqrdmulh v11.8h, v11.8h, v0.h[1] // .......|....*............
+ // mls v12.8h, v11.8h, v1.h[0] // .......|........*........
+ // sub v11.8h, v10.8h, v12.8h // ...*...|.............*...
+ // add v10.8h, v10.8h, v12.8h // .*.....|...........*.....
+ // str q8, [x0], #4*16 // ..*....|............*....
+ // str q9, [x0, #-3*16] // .......|.........*.......
+ // str q10, [x0, #-2*16] // ....*..|..............*..
+ // str q11, [x0, #-1*16] // ......*|................*
+
+ sub count, count, #1
+ cbnz count, start
+ mul v3.8H, v3.8H, v0.H[0]
+ ldr q19, [x0, #48]
+ ldr q15, [x0, #0]
+ mls v3.8H, v7.8H, v1.H[0]
+ mul v13.8H, v19.8H, v0.H[0]
+ sqrdmulh v19.8H, v19.8H, v0.H[1]
+ ldr q7, [x0, #32]
+ sub v17.8H, v15.8H, v3.8H
+ add v10.8H, v15.8H, v3.8H
+ mls v13.8H, v19.8H, v1.H[0]
+ str q17, [x0, #16]
+ add v15.8H, v7.8H, v13.8H
+ str q10, [x0], #4*16
+ sub v13.8H, v7.8H, v13.8H
+ str q15, [x0, #-32]
+ str q13, [x0, #-16]
+```
+
+Let's start by looking at the optimized loop body going from `start:` to `cbnz count, start`:
+We see that the loop now has 4 blocks of 3 `gap`s meaning that SLOTHY predicts 4 stalls of 1 cycle each. This compares
+to 7 stalls in the version without software pipelining. We see that 2 load instructions are marked as early instructions
+(annotated `(e)`), meaning they have been moved into the previous iteration: Intuitively, this makes sense: We know
+statically what data we need to load for the next iteration, and loads have a fairly long latency, so we can improve
+performance by issuing loads early. For the code to still be correct, SLOTHY decreases the number of iterations by one
+(`sub count, count, #1`), adds the missing early-instructions for the first iteration before the loop, and finally adds
+the non-early instructions of the last iteration after the loop.
+
+Another experimental feature that can be witnessed in this example is _address offset fixup_. The two `ldr`s that
+were moved into the previous iteration have been reordered with the `str _, [x0], #64` which modifies the address
+register. SLOTHY is aware of this and has adjusted the immediate offsets in `ldr` accordingly. Without this, software
+pipelining would not be possible here. Address offset fixup is an important yet somewhat subtle feature, and mistakes
+in its handling are currently not caught by SLOTHY's selfcheck. Going into the details of why that is goes too far for
+this tutorial, but it is one of the reasons why the selfcheck does, as it stands, not replace a formal verification.
+
+## 5. Checking the quality of SLOTHY optimizations
+
+You may ask how we know that SLOTHY has actually done something useful here? Sure enough, the interleaving in the above
+example looks somewhat sensible, and SLOTHY's model predicts only few full-cycle stalls. However, at this point we don't
+have any indicator of the impact of SLOTHY's optimizations on real hardware.
+
+Indeed, developing accurate microarchitectural models for SLOTHY is a time-consuming and iterative process:
+It usually takes a while until you have refined things to the point where SLOTHY's prediction closely relates to
+performance on real hardware. The most common refinement steps are:
+1) There is a mistake in the microarchitectural model mismatching what is written in the Software Optimization Guide (SWOG);
+2) Some aspect of the microarchitecture (e.g., certain forwarding paths or other latency exceptions) is not documented in the SWOG.
+
+We briefly discuss two ways that we found useful to evaluate the quality of SLOTHY's optimizations and drive the
+refinement of microarchitectural models.
+
+First, one useful tool for approximate but independent (of SLOTHY) performance evaluation is LLVM's [Machine Code
+Analyzer](https://llvm.org/docs/CommandGuide/llvm-mca.html). If you have `llvm-mca` available in your PATH (you may have
+to compile LLVM >= 18 yourself), you can make use of it in SLOTHY by setting the `with_llvm_mca` flag.
+Let's look at the last example and enable LLVM MCA:
+
+```python
+slothy.load_source_from_file("../examples/naive/aarch64/aarch64_simple0_loop.s")
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+
+slothy.config.sw_pipelining.enabled = True
+slothy.config.sw_pipelining.optimize_preamble = False
+slothy.config.sw_pipelining.optimize_postamble = False
+slothy.config.with_llvm_mca = True
+slothy.optimize_loop("start")
+slothy.write_source_to_file("./aarch64_simple0_loop_mca_a55.s")
+```
+
+This will call LLVM MCA on both the original code and the optimized code and append the LLVM MCA statistics as a comment to the output.
+Somewhere in the code you will see:
+```nasm
+// LLVM MCA STATISTICS (ORIGINAL) BEGIN
+//
+// Iterations: 100
+// Instructions: 2000
+// Total Cycles: 3102
+// Total uOps: 2100
+//
+// Dispatch Width: 2
+// uOps Per Cycle: 0.68
+// IPC: 0.64
+// Block RThroughput: 10.5
+
+...
+
+// LLVM MCA STATISTICS (OPTIMIZED) BEGIN
+//
+// Iterations: 100
+// Instructions: 2000
+// Total Cycles: 2102
+// Total uOps: 2100
+//
+// Dispatch Width: 2
+// uOps Per Cycle: 1.00
+// IPC: 0.95
+// Block RThroughput: 10.5
+```
+
+This suggests that our optimizations were actually useful: With respect to LLVM-MCA's scheduling model of Cortex-A55, the
+ cycle count per iteration was reduced from 31 cycles to 21 cycles.
+
+But LLVM MCA gives you more: It outputs a timeline view showing how each instruction travels through the pipeline:
+```nasm
+// Timeline view (ORIGINAL):
+// 0123456789 0123456789 0123456789 0123456789 01234
+// Index 0123456789 0123456789 0123456789 0123456789 0123456789
+//
+// [0,0] DeeE . . . . . . . . . . . . . . . . . . . ldr q0, [x1]
+// [0,1] .DeeE. . . . . . . . . . . . . . . . . . . ldr q1, [x2]
+// [0,2] . DeeE . . . . . . . . . . . . . . . . . . ldr q8, [x0]
+// [0,3] . DeeE . . . . . . . . . . . . . . . . . . ldr q9, [x0, #16]
+// [0,4] . DeeE . . . . . . . . . . . . . . . . . . ldr q10, [x0, #32]
+// [0,5] . DeeE . . . . . . . . . . . . . . . . . . ldr q11, [x0, #48]
+// [0,6] . .DeeeE . . . . . . . . . . . . . . . . . mul v24.8h, v9.8h, v0.h[0]
+// [0,7] . . DeeeE . . . . . . . . . . . . . . . . . sqrdmulh v9.8h, v9.8h, v0.h[1]
+// [0,8] . . .DeeeE . . . . . . . . . . . . . . . . mls v24.8h, v9.8h, v1.h[0]
+// [0,9] . . . DeE . . . . . . . . . . . . . . . . sub v9.8h, v8.8h, v24.8h
+// [0,10] . . . .DeE . . . . . . . . . . . . . . . . add v8.8h, v8.8h, v24.8h
+// [0,11] . . . . DeeeE . . . . . . . . . . . . . . . mul v24.8h, v11.8h, v0.h[0]
+// [0,12] . . . . DeeeE . . . . . . . . . . . . . . . sqrdmulh v11.8h, v11.8h, v0.h[1]
+// [0,13] . . . . . DeeeE . . . . . . . . . . . . . . mls v24.8h, v11.8h, v1.h[0]
+// [0,14] . . . . . .DeE . . . . . . . . . . . . . . sub v11.8h, v10.8h, v24.8h
+// [0,15] . . . . . . DeE. . . . . . . . . . . . . . add v10.8h, v10.8h, v24.8h
+// [0,16] . . . . . . DE. . . . . . . . . . . . . . str q8, [x0], #64
+// [0,17] . . . . . . DE . . . . . . . . . . . . . stur q9, [x0, #-48]
+// [0,18] . . . . . . DE . . . . . . . . . . . . . stur q10, [x0, #-32]
+// [0,19] . . . . . . .DE . . . . . . . . . . . . . stur q11, [x0, #-16]
+// [1,0] . . . . . . .DeeE. . . . . . . . . . . . . ldr q0, [x1]
+// [1,1] . . . . . . . DeeE . . . . . . . . . . . . ldr q1, [x2]
+// [1,2] . . . . . . . DeeE . . . . . . . . . . . . ldr q8, [x0]
+// [1,3] . . . . . . . DeeE . . . . . . . . . . . . ldr q9, [x0, #16]
+// [1,4] . . . . . . . DeeE . . . . . . . . . . . . ldr q10, [x0, #32]
+// [1,5] . . . . . . . .DeeE. . . . . . . . . . . . ldr q11, [x0, #48]
+// [1,6] . . . . . . . . DeeeE . . . . . . . . . . . mul v24.8h, v9.8h, v0.h[0]
+// [1,7] . . . . . . . . DeeeE . . . . . . . . . . . sqrdmulh v9.8h, v9.8h, v0.h[1]
+// [1,8] . . . . . . . . . DeeeE . . . . . . . . . . mls v24.8h, v9.8h, v1.h[0]
+// [1,9] . . . . . . . . . .DeE . . . . . . . . . . sub v9.8h, v8.8h, v24.8h
+// [1,10] . . . . . . . . . . DeE. . . . . . . . . . add v8.8h, v8.8h, v24.8h
+// [1,11] . . . . . . . . . . DeeeE . . . . . . . . . mul v24.8h, v11.8h, v0.h[0]
+// [1,12] . . . . . . . . . . DeeeE . . . . . . . . . sqrdmulh v11.8h, v11.8h, v0.h[1]
+// [1,13] . . . . . . . . . . . DeeeE . . . . . . . . mls v24.8h, v11.8h, v1.h[0]
+// [1,14] . . . . . . . . . . . . DeE. . . . . . . . sub v11.8h, v10.8h, v24.8h
+// [1,15] . . . . . . . . . . . . DeE . . . . . . . add v10.8h, v10.8h, v24.8h
+// [1,16] . . . . . . . . . . . . DE . . . . . . . str q8, [x0], #64
+// [1,17] . . . . . . . . . . . . DE . . . . . . . stur q9, [x0, #-48]
+// [1,18] . . . . . . . . . . . . .DE . . . . . . . stur q10, [x0, #-32]
+// [1,19] . . . . . . . . . . . . . DE . . . . . . . stur q11, [x0, #-16]
+// [2,0] . . . . . . . . . . . . . DeeE . . . . . . ldr q0, [x1]
+// [2,1] . . . . . . . . . . . . . DeeE . . . . . . ldr q1, [x2]
+// [2,2] . . . . . . . . . . . . . DeeE . . . . . . ldr q8, [x0]
+// [2,3] . . . . . . . . . . . . . DeeE . . . . . . ldr q9, [x0, #16]
+// [2,4] . . . . . . . . . . . . . .DeeE. . . . . . ldr q10, [x0, #32]
+// [2,5] . . . . . . . . . . . . . . DeeE . . . . . ldr q11, [x0, #48]
+// [2,6] . . . . . . . . . . . . . . DeeeE . . . . . mul v24.8h, v9.8h, v0.h[0]
+// [2,7] . . . . . . . . . . . . . . DeeeE . . . . . sqrdmulh v9.8h, v9.8h, v0.h[1]
+// [2,8] . . . . . . . . . . . . . . . DeeeE . . . . mls v24.8h, v9.8h, v1.h[0]
+// [2,9] . . . . . . . . . . . . . . . . DeE. . . . sub v9.8h, v8.8h, v24.8h
+// [2,10] . . . . . . . . . . . . . . . . DeE . . . add v8.8h, v8.8h, v24.8h
+// [2,11] . . . . . . . . . . . . . . . . DeeeE . . . mul v24.8h, v11.8h, v0.h[0]
+// [2,12] . . . . . . . . . . . . . . . . DeeeE. . . sqrdmulh v11.8h, v11.8h, v0.h[1]
+// [2,13] . . . . . . . . . . . . . . . . . DeeeE . . mls v24.8h, v11.8h, v1.h[0]
+// [2,14] . . . . . . . . . . . . . . . . . . DeE . sub v11.8h, v10.8h, v24.8h
+// [2,15] . . . . . . . . . . . . . . . . . . DeE . add v10.8h, v10.8h, v24.8h
+// [2,16] . . . . . . . . . . . . . . . . . . DE . str q8, [x0], #64
+// [2,17] . . . . . . . . . . . . . . . . . . .DE . stur q9, [x0, #-48]
+// [2,18] . . . . . . . . . . . . . . . . . . . DE. stur q10, [x0, #-32]
+// [2,19] . . . . . . . . . . . . . . . . . . . DE stur q11, [x0, #-16]
+...
+
+// Timeline view (OPTIMIZED):
+// 0123456789 0123456789 0123456789
+// Index 0123456789 0123456789 0123456789 01234
+//
+// [0,0] DeeE . . . . . . . . . . . . . ldr q7, [x1]
+// [0,1] .DeeE. . . . . . . . . . . . . ldr q31, [x0, #16]
+// [0,2] . DeeE . . . . . . . . . . . . ldr q11, [x0, #48]
+// [0,3] . DeeeE . . . . . . . . . . . . mul v20.8h, v31.8h, v7.h[0]
+// [0,4] . DeeeE. . . . . . . . . . . . sqrdmulh v31.8h, v31.8h, v7.h[1]
+// [0,5] . .DeeeE . . . . . . . . . . . mul v18.8h, v11.8h, v7.h[0]
+// [0,6] . . DeeeE . . . . . . . . . . . sqrdmulh v7.8h, v11.8h, v7.h[1]
+// [0,7] . . DeeE . . . . . . . . . . . ldr q11, [x2]
+// [0,8] . . DeeE . . . . . . . . . . . ldr q8, [x0]
+// [0,9] . . .DeeeE . . . . . . . . . . mls v20.8h, v31.8h, v11.h[0]
+// [0,10] . . . DeeeE . . . . . . . . . . mls v18.8h, v7.8h, v11.h[0]
+// [0,11] . . . DeeE . . . . . . . . . . ldr q7, [x0, #32]
+// [0,12] . . . DeE . . . . . . . . . . sub v31.8h, v8.8h, v20.8h
+// [0,13] . . . .DeE . . . . . . . . . . add v11.8h, v8.8h, v20.8h
+// [0,14] . . . . DeE. . . . . . . . . . sub v20.8h, v7.8h, v18.8h
+// [0,15] . . . . DE . . . . . . . . . . str q31, [x0, #16]
+// [0,16] . . . . DeE . . . . . . . . . add v7.8h, v7.8h, v18.8h
+// [0,17] . . . . DE . . . . . . . . . str q11, [x0], #64
+// [0,18] . . . . DE . . . . . . . . . stur q7, [x0, #-32]
+// [0,19] . . . . .DE . . . . . . . . . stur q20, [x0, #-16]
+// [1,0] . . . . .DeeE. . . . . . . . . ldr q7, [x1]
+// [1,1] . . . . . DeeE . . . . . . . . ldr q31, [x0, #16]
+// [1,2] . . . . . DeeE . . . . . . . . ldr q11, [x0, #48]
+// [1,3] . . . . . DeeeE. . . . . . . . mul v20.8h, v31.8h, v7.h[0]
+// [1,4] . . . . . .DeeeE . . . . . . . sqrdmulh v31.8h, v31.8h, v7.h[1]
+// [1,5] . . . . . . DeeeE . . . . . . . mul v18.8h, v11.8h, v7.h[0]
+// [1,6] . . . . . . DeeeE . . . . . . . sqrdmulh v7.8h, v11.8h, v7.h[1]
+// [1,7] . . . . . . DeeE . . . . . . . ldr q11, [x2]
+// [1,8] . . . . . . DeeE . . . . . . . ldr q8, [x0]
+// [1,9] . . . . . . . DeeeE . . . . . . mls v20.8h, v31.8h, v11.h[0]
+// [1,10] . . . . . . . DeeeE . . . . . . mls v18.8h, v7.8h, v11.h[0]
+// [1,11] . . . . . . . DeeE . . . . . . ldr q7, [x0, #32]
+// [1,12] . . . . . . . .DeE . . . . . . sub v31.8h, v8.8h, v20.8h
+// [1,13] . . . . . . . . DeE. . . . . . add v11.8h, v8.8h, v20.8h
+// [1,14] . . . . . . . . DeE . . . . . sub v20.8h, v7.8h, v18.8h
+// [1,15] . . . . . . . . DE. . . . . . str q31, [x0, #16]
+// [1,16] . . . . . . . . DeE . . . . . add v7.8h, v7.8h, v18.8h
+// [1,17] . . . . . . . . DE . . . . . str q11, [x0], #64
+// [1,18] . . . . . . . . .DE . . . . . stur q7, [x0, #-32]
+// [1,19] . . . . . . . . . DE . . . . . stur q20, [x0, #-16]
+// [2,0] . . . . . . . . . DeeE . . . . ldr q7, [x1]
+// [2,1] . . . . . . . . . DeeE . . . . ldr q31, [x0, #16]
+// [2,2] . . . . . . . . . DeeE . . . . ldr q11, [x0, #48]
+// [2,3] . . . . . . . . . .DeeeE . . . mul v20.8h, v31.8h, v7.h[0]
+// [2,4] . . . . . . . . . . DeeeE . . . sqrdmulh v31.8h, v31.8h, v7.h[1]
+// [2,5] . . . . . . . . . . DeeeE . . . mul v18.8h, v11.8h, v7.h[0]
+// [2,6] . . . . . . . . . . DeeeE . . . sqrdmulh v7.8h, v11.8h, v7.h[1]
+// [2,7] . . . . . . . . . . DeeE . . . ldr q11, [x2]
+// [2,8] . . . . . . . . . . .DeeE. . . ldr q8, [x0]
+// [2,9] . . . . . . . . . . . DeeeE . . mls v20.8h, v31.8h, v11.h[0]
+// [2,10] . . . . . . . . . . . DeeeE . . mls v18.8h, v7.8h, v11.h[0]
+// [2,11] . . . . . . . . . . . DeeE . . ldr q7, [x0, #32]
+// [2,12] . . . . . . . . . . . . DeE. . sub v31.8h, v8.8h, v20.8h
+// [2,13] . . . . . . . . . . . . DeE . add v11.8h, v8.8h, v20.8h
+// [2,14] . . . . . . . . . . . . DeE . sub v20.8h, v7.8h, v18.8h
+// [2,15] . . . . . . . . . . . . DE . str q31, [x0, #16]
+// [2,16] . . . . . . . . . . . . DeE . add v7.8h, v7.8h, v18.8h
+// [2,17] . . . . . . . . . . . . .DE . str q11, [x0], #64
+// [2,18] . . . . . . . . . . . . . DE. stur q7, [x0, #-32]
+// [2,19] . . . . . . . . . . . . . DE stur q20, [x0, #-16]
+```
+
+However, LLVM MCA's model might not be accurate either and cannot replace measurements on real hardware -- so let's
+do that. Here, we use a [profiling tool](https://github.com/slothy-optimizer/pqax/tree/main/tests/profiling) we wrote
+as part of the [pqax](https://github.com/slothy-optimizer/pqax) benchmarking framework. It takes an assembly snippet as
+input and automatically generates a program running and benchmarking prefixes of the input, and combines them into a
+performance diagram similar to the one generated by LLVM-MCA. Here's the output in our case:
+
+```nasm
+===== Stepwise profiling =======
+[ 0]: ldr q0, [x1, #0] ......*.....................................
+[ 1]: ldr q8, [x0, #0*16] .......*....................................
+[ 2]: ldr q9, [x0, #1*16] .........*..................................
+[ 3]: ldr q10, [x0, #2*16] ...........*................................
+[ 4]: ldr q11, [x0, #3*16] .............*..............................
+[ 5]: mul v12.8h, v9.8h, v0.h[0] ...............*............................
+[ 6]: sqrdmulh v9.8h, v9.8h, v0.h[1] ................*...........................
+[ 7]: mls v12.8h, v9.8h, v1.h[0] .................*..........................
+[ 8]: sub v9.8h, v8.8h, v12.8h .....................*......................
+[ 9]: add v8.8h, v8.8h, v12.8h ........................*...................
+[ 10]: mul v12.8h, v11.8h, v0.h[0] .........................*..................
+[ 11]: sqrdmulh v11.8h, v11.8h, v0.h[1] ..........................*.................
+[ 12]: mls v12.8h, v11.8h, v1.h[0] ...........................*................
+[ 13]: sub v11.8h, v10.8h, v12.8h ...............................*............
+[ 14]: add v10.8h, v10.8h, v12.8h ..................................*.........
+[ 15]: str q8, [x0], #4*16 ...................................*........
+[ 16]: str q9, [x0, #-3*16] .....................................*......
+[ 17]: str q10, [x0, #-2*16] .....................................*......
+[ 18]: str q11, [x0, #-1*16] ........................................*...
+
+===== Stepwise profiling (OPTIMIZED) =======
+[ 0]: ldr q18, [x0, #16] // .*........................
+[ 1]: sqrdmulh v8.8H, v6.8H, v2.H[1] // ..*.......................
+[ 2]: mul v23.8H, v6.8H, v2.H[0] // ...*......................
+[ 3]: ldr q31, [x0, #32] // ....*.....................
+[ 4]: mul v3.8H, v18.8H, v2.H[0] // ......*...................
+[ 5]: mls v23.8H, v8.8H, v1.H[0] // .......*..................
+[ 6]: sqrdmulh v9.8H, v18.8H, v2.H[1] // ........*.................
+[ 7]: ldr q15, [x0, #0] // .........*................
+[ 8]: sub v11.8H, v31.8H, v23.8H // ...........*..............
+[ 9]: mls v3.8H, v9.8H, v1.H[0] // ............*.............
+[ 10]: add v16.8H, v31.8H, v23.8H // .............*............
+[ 11]: str q11, [x0, #48] // ..............*...........
+[ 12]: ldr q2, [x1, #0] // ...............*..........
+[ 13]: add v13.8H, v15.8H, v3.8H // .................*........
+[ 14]: str q16, [x0, #32] // ..................*.......
+[ 15]: sub v7.8H, v15.8H, v3.8H // ...................*......
+[ 16]: str q13, [x0], #4*16 // ....................*.....
+[ 17]: ldr q6, [x0, #48] // .....................*....
+[ 18]: str q7, [x0, #-48] // .......................*..
+```
+
+We can see that SLOTHY's predictions were exactly right, and that LLVM-MCA's model is off in a few places.
+So, in a nutshell, we'd say that LLVM-MCA is great for quick evaluation of performance, but when you get down
+to the last cycle and fine-tuning your model, there is no way around measurements on real hardware.
+
+## 6. Optimizing a full Neon NTT
+
+The examples previously considered were all toy examples, so you may wonder how to apply SLOTHY to actual cryptographic code.
+Let's look at a real-world example: The Kyber number-theoretic transform -- a core arithmetic function of the Kyber key-encapsulation mechanism making up a large chunk of the total run-time.
+The target platform is again the Arm Cortex-A55 and the code primarily consists of
+Neon vector instructions.
+We'll consider a straightforward implementation available here: [ntt_kyber_123_4567.s](../examples/naive/aarch64/ntt_kyber_123_4567.s).
+If you have ever written an NTT, it should be fairly easy to understand what the code is doing.
+The code consists of 2 main loops implementing layers 1+2+3 and 4+5+6+7 of the NTT.
+The actual operations are wrapped in macros implementing butterflies on single vector registers.
+Note that this code performs very poorly: No consideration was given to the intricacies of the microarchitecture.
+
+Let's run SLOTHY on this code:
+```python
+slothy.load_source_from_file("examples/naive/aarch64/ntt_kyber_123_4567.s")
+slothy.config.sw_pipelining.enabled = True
+slothy.config.inputs_are_outputs = True
+slothy.config.sw_pipelining.minimize_overlapping = False
+slothy.config.variable_size = True
+slothy.config.reserved_regs = [f"x{i}" for i in range(0, 7)] + ["x30", "sp"]
+slothy.config.constraints.stalls_first_attempt = 64
+slothy.optimize_loop("layer123_start")
+slothy.optimize_loop("layer4567_start")
+slothy.write_source_to_file("opt/ntt_kyber_123_4567_opt_a55.s")
+```
+
+We simply optimize both loops separately.
+You will notice some additional flags we have set. To read the documentation of those, please have a look at [config.py](../slothy/core/config.py).
+We have set an additional flag: `inputs_are_outputs = True`. This tells SLOTHY that the registers that are used as
+inputs to the loop (e.g., the pointer to the polynomial input) are also outputs of the entire loop; otherwise, SLOTHY
+could overwrite them in the postamble once they are no longer needed. You most likely want `inputs_are_outputs=True`
+whenever you are optimizing a loop. We also use the `reserved_regs` option to tell SLOTHY that registers `x0, ..., x7,
+x30, sp` are used for other purposes and should not be used by SLOTHY. When optimizing only parts of a function, it is
+essential to tell SLOTHY which registers should not be used: By default SLOTHY will use any of the architectural
+registers. If you are familiar with inline assembly, SLOTHY's `reserved_regs` are essentially the complement of the
+'clobber list'.
+
+When running this example, you will notice that it has a significantly longer runtime.
+On my Intel i7-1360P it takes approximately 15 minutes to optimize both loops.
+You may instead look at an optimized version of the same code [examples/opt/aarch64/ntt_kyber_123_4567_opt_a55.s](../examples/opt/aarch64/ntt_kyber_123_4567_opt_a55.s).
+You notice that both loops have many early instructions, and coming up with this code by hand would be tedious, time-consuming and error-prone.
+
+
+## 7. Optimizing larger pieces of code
+
+We've seen that the code above can be optimized relatively fast (within seconds to minutes on a laptop).
+When using a more powerful machine and allowing optimization times of hours, one can scale this up to larger examples.
+We've successfully used (vanilla) SLOTHY for optimized code snippets of up to 180 instructions.
+However, for larger code at a certain point the constraint solving becomes prohibitively expensive and we need to use a different strategy.
+
+One such example is the X25519 implementation we looked at in the [SLOTHY paper](https://eprint.iacr.org/2022/1303) available in [X25519-AArch64-simple.s](../examples/naive/aarch64/X25519-AArch64-simple.s)
+It is a hybrid vector-scalar implementation based on an [implementation](https://github.com/Emill/X25519-AArch64) by Lenngren.
+Its core loop consists of 958 instructions which well exceeds what SLOTHY can currently optimize in a single pass.
+
+However, we can still make use of SLOTHY to optimize this code by employing heuristics.
+One particularly useful heuristic supported by SLOTHY is the `splitting` heuristic.
+When a piece of code is too large to be optimized at once, it splits it into multiple overlapping pieces that are optimized separately.
+With this approach one loses the optimality guarantees as it may be that there is a solution that SLOTHY cannot find due to the splitting.
+However, by repeatedly running SLOTHY using the `splitting` heuristic, we managed to outperform the state-of-the-art and get very close to optimal results (in terms of IPC).
+
+To demonstrate the splitting heuristic we can use the following SLOTHY call:
+```python
+# example
+slothy.load_source_from_file("../examples/naive/aarch64/X25519-AArch64-simple.s")
+
+# first pass: replace symbolic register names by architectural registers
+slothy.config.inputs_are_outputs=True
+slothy.config.outputs=["x0"]
+slothy.config.constraints.functional_only = True
+slothy.config.constraints.allow_reordering = False
+slothy.optimize(start="mainloop", end="end_label")
+slothy.config.constraints.functional_only = False
+slothy.config.constraints.allow_reordering = True
+
+# second pass: splitting heuristic
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+slothy.config.split_heuristic = True
+slothy.config.split_heuristic_stepsize = 0.05
+slothy.config.split_heuristic_factor = 10
+slothy.config.split_heuristic_repeat = 2
+slothy.optimize(start="mainloop", end="end_label")
+slothy.write_source_to_file("opt/X25519-AArch64-simple_opt.s")
+```
+
+
+The `splitting` heuristic can be turned on by setting `slothy.config.split_heuristic = True`.
+It has three main parameters:
+- `split_heuristic_factor` : Determines the size of each split. In this case, 10 means that we will be optimizing 10% of the original code at a time.
+- `split_heuristic_stepsize` : Controls the degree of overlapping of the sliding window. Setting it to 0.05 means the sliding window moves by 5% every time. We will start with optimizing the first 10% ([0,0.1]) of the code, then [0.05,0.15], [0.1,0.20], ...
+- `split_heuristic_repeat`: The number of times the optimization should be repeated.
+
+You will notice in the example above that there is another call to `slothy.optimize()` prior to that.
+This is needed as the input implementation is using symbolic register names which is a feature unrelated to the splitting heuristic that we want to demonstrate here.
+It allows a developer of the code to leave the register allocation up to SLOTHY.
+Unfortunately, it is not compatible with the splitting heuristic (as register allocation can't be performed locally), and hence we first need to do the register allocation on the full code before we continue.
+We can configure SLOTHY to only consider register allocation by setting `allow_reordering=False` (disabling the ordering constraints) and `functional_only=True` (disabling the microarchitectural constraints).
+In this way, the constraints remain manageable, and SLOTHY finds a register allocation within a few minutes.
+
+Running this example takes around 15 minutes.
+You can instead look at the output available in [opt/X25519-AArch64-simple_opt.s](opt/X25519-AArch64-simple_opt.s).
+The output will look similar to the previous examples and contains significantly fewer pipeline stalls than the input.
+For achieving the best performance, we require a few more calls to SLOTHY. You can find the script we used [here](../paper/scripts/slothy_x25519.sh) - it runs around 1.5 hours.
+
+## 8. Adding a new microarchitecture
+
+You may wonder how to extend SLOTHY to include a new microarchitecture.
+For example, you may want to optimize code for a newer iteration of the Arm Cortex-A55, e.g., the Arm Cortex-A510.
+To understand what is needed for that, let's look at the microarchitectural model for the Cortex-A55 available in [slothy/targets/aarch64/cortex_a55.py](../slothy/targets/aarch64/cortex_a55.py).
+
+Skipping some boilerplate code, you will see the following structure:
+```python
+from slothy.targets.aarch64.aarch64_neon import *
+
+issue_rate = 2
+class ExecutionUnit(Enum):
+ """Enumeration of execution units in Cortex-A55 model"""
+ SCALAR_ALU0=1
+ SCALAR_ALU1=2
+ SCALAR_MAC=3
+ SCALAR_LOAD=4
+ SCALAR_STORE=5
+ VEC0=6
+ VEC1=7
+ # ...
+
+execution_units = {
+ // ...
+}
+
+inverse_throughput = {
+ // ...
+}
+
+default_latencies = {
+ // ...
+}
+
+
+def get_latency(src, out_idx, dst):
+ // ...
+ latency = lookup_multidict(
+ default_latencies, src)
+ // ...
+ return latency
+
+def get_units(src):
+ units = lookup_multidict(execution_units, src)
+ if isinstance(units,list):
+ return units
+ return [units]
+
+def get_inverse_throughput(src):
+ return lookup_multidict(
+ inverse_throughput, src)
+```
+
+Going through the snippet, we can see the core components:
+ - Definition of the `issue_rate` corresponding to the number of issue slots available per cycle. Since the Cortex-A55 is a dual-issue CPU, this is two.
+ - Definition of an `Enum` modelling the different execution units available. In this case, we model 2 scalar units, one
+ MAC unit, 2 64-bit vector units, one load unit, and one store unit.
+ - Finally, we need to implement the functions `get_latency`, `get_units`, `get_inverse_throughput` returning the
+ latency, occupied execution units, and throughputs. The input to these functions is a class from the architectural
+ model representing the instruction in question. For example, the class `vmull` in
+ [aarch64_neon.py](../slothy/targets/aarch64/aarch64_neon.py) corresponds to the `umull` instruction. We commonly
+ implement this using dictionaries above.
+
+For example, for the (128-bit/qform) `vmull` instruction, we can find in the [Arm Cortex-A55 Software Optimization
+Guide](https://developer.arm.com/documentation/EPM128372/latest/) that it occupies both vector execution units, has an
+inverse throughput of 1, and a latency of 4 cycles. We can model this in the following way:
+
+```python
+execution_units = {
+ ( vmull ): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+}
+
+inverse_throughput = {
+ ( vmull ) : 1,
+}
+
+default_latencies = {
+ ( vmull ) : 4,
+}
+```
+
+We mostly use the tuple-syntax, so we can group together instructions that belong together.
+For example, later we may want to add the Neon `add`. From the SWOG we can see that (128-bit/qform) `add` occupies both
+64-bit vector execution units, has a latency of 3 cycles, and throughput of 1 cycle.
+We can extend the above model as follows:
+
+```python
+execution_units = {
+ ( vmull, vadd ): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]],
+}
+
+inverse_throughput = {
+ ( vmull, vadd ) : 1,
+}
+
+default_latencies = {
+ ( vmull ) : 4,
+ ( vadd ) : 3,
+}
+```
+
+
+(When looking at the actual model, you will notice that this is not quite how it is modelled. You will see that for some
+instructions, we have to distinguish between the q-form (128-bit) and the d-form (64-bit) of the instruction. Q-form
+instructions occupy both vector execution units, while most D-form instructions occupy only 1. Latencies also vary
+depending on the actual form.)
+
+Note that both the architectural model and the micro-architectural model can be built lazily: As long as the
+corresponding instructions do not appear in your input, you may leave out their description.
+As soon as you hit an instruction that is not part of the architectural or micro-architectural model, you will see an
+error.
+
+## Troubleshooting
+
+- ModuleNotFoundError: No module named 'ortools'
+
+This suggests that you have not installed the required dependencies needed by SLOTHY.
+Either you need to follow the installation instructions, or if you have done that already, you likely forgot to enter the virtual environment you have installed them in using `source venv/bin/activate`. You will have to run this every time you open a new terminal.
+
+- The selfcheck passes but the code is functionally incorrect!
+
+The most common reason for this is a bad configuration: Check that all registers that must be kept for the sake of
+the surrounding code are marked as `reserved_regs`.
+
+Another possibility, albeit hopefully rare by now, is a failure during address offset fixup: This feature is not yet
+stable, and the selfcheck is currently blind to erroneous calculations here. If you are sure your configuration is correct, you might
+want to check the adjusted address offsets manually. If you find a bug, let us know!
diff --git a/tutorial/opt/X25519-AArch64-simple_opt.s b/tutorial/opt/X25519-AArch64-simple_opt.s
new file mode 100644
index 00000000..3bedd5ad
--- /dev/null
+++ b/tutorial/opt/X25519-AArch64-simple_opt.s
@@ -0,0 +1,3652 @@
+ /* X25519-AArch64 by Emil Lenngren (2018)
+ *
+ * To the extent possible under law, the person who associated CC0 with
+ * X25519-AArch64 has waived all copyright and related or neighboring rights
+ * to X25519-AArch64.
+ *
+ * You should have received a copy of the CC0 legalcode along with this
+ * work. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+/*
+ * This is an AArch64 implementation of X25519.
+ * It follows the reference implementation where the representation of
+ * a field element [0..2^255-19) is represented by a 256-bit little endian integer,
+ * reduced modulo 2^256-38, and may possibly be in the range [2^256-38..2^256).
+ * The scalar is a 256-bit integer where certain bits are hardcoded per specification.
+ *
+ * The implementation runs in constant time (~145k cycles on Cortex-A53),
+ * and no conditional branches or memory access pattern depend on secret data.
+ */
+
+/*
+ * Implementation manually de-interleaved and modularized for use with SLOTHY. See
+ *
+ * Fast and Clean: Auditable High Performance Assembly via Constraint Solving
+ * (Abdulrahman, Becker, Kannwischer, Klein)
+ */
+
+#include
+#include "instruction_wrappers.i"
+
+.macro fcsel_dform out, in0, in1, cond // @slothy:no-unfold
+ fcsel dform_\out, dform_\in0, dform_\in1, \cond
+.endm
+
+#define STACK_MASK1 0
+#define STACK_MASK2 8
+#define STACK_A_0 16
+#define STACK_A_8 (STACK_A_0+ 8)
+#define STACK_A_16 (STACK_A_0+16)
+#define STACK_A_24 (STACK_A_0+24)
+#define STACK_A_32 (STACK_A_0+32)
+#define STACK_B_0 64
+#define STACK_B_8 (STACK_B_0+ 8)
+#define STACK_B_16 (STACK_B_0+16)
+#define STACK_B_24 (STACK_B_0+24)
+#define STACK_B_32 (STACK_B_0+32)
+#define STACK_CTR 104
+#define STACK_LASTBIT 108
+#define STACK_SCALAR 112
+#define STACK_X_0 168
+#define STACK_X_8 (STACK_X_0+ 8)
+#define STACK_X_16 (STACK_X_0+16)
+#define STACK_X_24 (STACK_X_0+24)
+#define STACK_X_32 (STACK_X_0+32)
+#define STACK_OUT_PTR (STACK_X_0+48)
+
+ .cpu generic+fp+simd
+ .text
+ .align 2
+
+ // in: x0: pointer
+ // out: x0: loaded value
+ // .type load64unaligned, %function
+load64unaligned:
+ ldrb w1, [x0]
+ ldrb w2, [x0, #1]
+ ldrb w3, [x0, #2]
+ ldrb w4, [x0, #3]
+ ldrb w5, [x0, #4]
+ ldrb w6, [x0, #5]
+ ldrb w7, [x0, #6]
+ ldrb w8, [x0, #7]
+
+ orr w1, w1, w2, lsl #8
+ orr w3, w3, w4, lsl #8
+ orr w5, w5, w6, lsl #8
+ orr w7, w7, w8, lsl #8
+
+ orr w1, w1, w3, lsl #16
+ orr w5, w5, w7, lsl #16
+
+ orr x0, x1, x5, lsl #32
+
+ ret
+ // .size load64unaligned, .-load64unaligned
+
+ // in: x0: pointer
+ // out: x0-x3: loaded value
+ // .type load256unaligned, %function
+load256unaligned:
+ stp x29, x30, [sp, #-64]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ stp x21, x22, [sp, #32]
+
+ mov x19, x0
+ bl load64unaligned
+ mov x20, x0
+ add x0, x19, #8
+ bl load64unaligned
+ mov x21, x0
+ add x0, x19, #16
+ bl load64unaligned
+ mov x22, x0
+ add x0, x19, #24
+ bl load64unaligned
+ mov x3, x0
+
+ mov x0, x20
+ mov x1, x21
+ mov x2, x22
+
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x29, x30, [sp], #64
+ ret
+ // .size load256unaligned, .-load256unaligned
+
+// ---------------------------------------------------------------------------
+// Register aliases.
+// Several alias families map onto the same physical registers. This appears
+// intentional — different names for the same register express values with
+// disjoint live ranges (e.g. vT*/vTA* reuse vAB*, vDC* reuse vBX*).
+// NOTE(review): liveness of overlapping aliases should be confirmed against
+// the code that uses them.
+// ---------------------------------------------------------------------------
+
+// Product limbs A*B (v0-v9).
+vAB0 .req v0
+vAB1 .req v1
+vAB2 .req v2
+vAB3 .req v3
+vAB4 .req v4
+vAB5 .req v5
+vAB6 .req v6
+vAB7 .req v7
+vAB8 .req v8
+vAB9 .req v9
+
+// vT* reuses the vAB registers (disjoint live range assumed).
+vT0 .req vAB0
+vT1 .req vAB1
+vT2 .req vAB2
+vT3 .req vAB3
+vT4 .req vAB4
+vT5 .req vAB5
+vT6 .req vAB6
+vT7 .req vAB7
+vT8 .req vAB8
+vT9 .req vAB9
+
+// vTA* also reuses the vAB registers.
+vTA0 .req vAB0
+vTA1 .req vAB1
+vTA2 .req vAB2
+vTA3 .req vAB3
+vTA4 .req vAB4
+vTA5 .req vAB5
+vTA6 .req vAB6
+vTA7 .req vAB7
+vTA8 .req vAB8
+vTA9 .req vAB9
+
+// Product limbs B*X (v10-v19).
+vBX0 .req v10
+vBX1 .req v11
+vBX2 .req v12
+vBX3 .req v13
+vBX4 .req v14
+vBX5 .req v15
+vBX6 .req v16
+vBX7 .req v17
+vBX8 .req v18
+vBX9 .req v19
+
+// vDC* reuses the vBX registers.
+vDC0 .req vBX0
+vDC1 .req vBX1
+vDC2 .req vBX2
+vDC3 .req vBX3
+vDC4 .req vBX4
+vDC5 .req vBX5
+vDC6 .req vBX6
+vDC7 .req vBX7
+vDC8 .req vBX8
+vDC9 .req vBX9
+
+// Combined AD/BC limbs (v20-v29).
+vADBC0 .req v20
+vADBC1 .req v21
+vADBC2 .req v22
+vADBC3 .req v23
+vADBC4 .req v24
+vADBC5 .req v25
+vADBC6 .req v26
+vADBC7 .req v27
+vADBC8 .req v28
+vADBC9 .req v29
+
+// vX4Z5* reuses the vADBC registers.
+vX4Z50 .req vADBC0
+vX4Z51 .req vADBC1
+vX4Z52 .req vADBC2
+vX4Z53 .req vADBC3
+vX4Z54 .req vADBC4
+vX4Z55 .req vADBC5
+vX4Z56 .req vADBC6
+vX4Z57 .req vADBC7
+vX4Z58 .req vADBC8
+vX4Z59 .req vADBC9
+
+// Mask constants.
+// NOTE(review): vMaskB aliases v15, which is also vBX5/vZ34 — confirm the
+// uses are disjoint.
+vMaskA .req v30
+vMaskB .req v15
+
+// Even limbs of Z2/Z3/X2/X3 (odd/even v-register interleaving).
+vZ20 .req v1
+vZ22 .req v3
+vZ24 .req v5
+vZ26 .req v7
+vZ28 .req v9
+
+vZ30 .req v11
+vZ32 .req v13
+vZ34 .req v15
+vZ36 .req v17
+vZ38 .req v19
+
+vX20 .req v0
+vX22 .req v2
+vX24 .req v4
+vX26 .req v6
+vX28 .req v8
+
+vX30 .req v10
+vX32 .req v12
+vX34 .req v14
+vX36 .req v16
+vX38 .req v18
+
+// Operand aliases for the add/sub/cmov/transpose macros below.
+vB0 .req v20
+vB2 .req v21
+vB4 .req v22
+vB6 .req v23
+vB8 .req v24
+
+vA0 .req v0
+vA2 .req v2
+vA4 .req v4
+vA6 .req v6
+vA8 .req v8
+
+vC0 .req v10
+vC2 .req v12
+vC4 .req v14
+vC6 .req v16
+vC8 .req v18
+
+vD0 .req v25
+vD2 .req v26
+vD4 .req v27
+vD6 .req v28
+vD8 .req v29
+
+vF0 .req v1
+vF2 .req v3
+vF4 .req v5
+vF6 .req v7
+vF8 .req v9
+
+vG0 .req v20
+vG2 .req v21
+vG4 .req v22
+vG6 .req v23
+vG8 .req v24
+
+// ---------------------------------------------------------------------------
+// Scalar (general-purpose) register aliases.
+// ---------------------------------------------------------------------------
+
+// F
+sF0 .req x0
+sF1 .req x1
+sF2 .req x2
+sF3 .req x3
+sF4 .req x4
+sF5 .req x5
+sF6 .req x6
+sF7 .req x7
+sF8 .req x8
+sF9 .req x9
+
+// AA = A^2 lives in callee-saved registers (x19-x28).
+sAA0 .req x20
+sAA1 .req x21
+sAA2 .req x22
+sAA3 .req x23
+sAA4 .req x24
+sAA5 .req x25
+sAA6 .req x26
+sAA7 .req x27
+sAA8 .req x28
+sAA9 .req x19
+
+// Scratch register. NOTE(review): aliases x2 (= sF2/sG2/sBB2) — confirm
+// it is only used while those are dead.
+stmp .req x2
+
+// G (same registers as F — disjoint live range assumed).
+sG0 .req x0
+sG1 .req x1
+sG2 .req x2
+sG3 .req x3
+sG4 .req x4
+sG5 .req x5
+sG6 .req x6
+sG7 .req x7
+sG8 .req x8
+sG9 .req x9
+
+sBB0 .req x0
+sBB1 .req x1
+sBB2 .req x2
+sBB3 .req x3
+sBB4 .req x4
+sBB5 .req x5
+sBB6 .req x6
+sBB7 .req x7
+sBB8 .req x8
+sBB9 .req x9
+
+// E
+// NOTE(review): sE8/sE9 alias x19/x20, which are also sAA9/sAA0 — confirm
+// disjoint use.
+sE0 .req x10
+sE1 .req x11
+sE2 .req x12
+sE3 .req x13
+sE4 .req x14
+sE5 .req x15
+sE6 .req x16
+sE7 .req x17
+sE8 .req x19
+sE9 .req x20
+
+sZ40 .req x23
+sZ41 .req x3
+sZ42 .req x21
+sZ44 .req x7
+sZ45 .req x6
+sZ46 .req x24
+sZ48 .req x22
+
+// Marker label for the start of the code region below.
+START:
+
+// Load the even limb words 0,2,4,6,8 of a scalar field element from the
+// stack. Register suffixes step by 2 while stack offsets step by 8, so each
+// 64-bit word presumably packs two limbs (see scalar_decompress) — confirm.
+//   \sA:     destination register-name prefix (e.g. sF -> \sA0..\sA8)
+//   \offset: symbolic stack-offset prefix (\offset_0 .. \offset_32)
+//   \name:   symbolic location name used by the @slothy dependency tags
+.macro scalar_stack_ldr sA, offset, name
+    ldr \sA\()0, [sp, #\offset\()_0]  // @slothy:reads=[\name\()0]
+    ldr \sA\()2, [sp, #\offset\()_8]  // @slothy:reads=[\name\()8]
+    ldr \sA\()4, [sp, #\offset\()_16] // @slothy:reads=[\name\()16]
+    ldr \sA\()6, [sp, #\offset\()_24] // @slothy:reads=[\name\()24]
+    ldr \sA\()8, [sp, #\offset\()_32] // @slothy:reads=[\name\()32]
+.endm
+
+// Store the even limb words 0,2,4,6,8 of a scalar field element to the
+// stack (inverse of scalar_stack_ldr; uses stp pairs where possible).
+.macro scalar_stack_str offset, sA, name
+    stp \sA\()0, \sA\()2, [sp, #\offset\()_0]  // @slothy:writes=[\name\()0,\name\()8]
+    stp \sA\()4, \sA\()6, [sp, #\offset\()_16] // @slothy:writes=[\name\()16,\name\()24]
+    str \sA\()8, [sp, #\offset\()_32] // @slothy:writes=[\name\()32]
+.endm
+
+// Store the low 64 bits of vector limb words 0,2,4,6,8 to the stack.
+// D<...> denotes the 64-bit d-register view of the vector alias
+// (SLOTHY register-view notation — presumably resolved by the tool).
+.macro vector_stack_str offset, vA, name
+    stp D<\vA\()0>, D<\vA\()2>, [sp, #\offset\()_0]  // @slothy:writes=[\name\()0,\name\()8]
+    stp D<\vA\()4>, D<\vA\()6>, [sp, #\offset\()_16] // @slothy:writes=[\name\()16,\name\()24]
+    str D<\vA\()8>, [sp, #\offset\()_32] // @slothy:writes=[\name\()32]
+.endm
+
+    // TODO: eliminate this explicit register assignment by converting stack_vld2_lane to AArch64Instruction
+    // Scratch pointer for the post-incremented ld2 loads below.
+    xvector_load_lane_tmp .req x26
+
+// Load one 32-bit lane (\lane) of the interleaved register pairs
+// (\vA0,\vA1), (\vA2,\vA3), ... from the stack at \offset_0, advancing
+// 8 bytes (one s-lane per register) per ld2.
+.macro vector_load_lane vA, offset, lane, name
+    add xvector_load_lane_tmp, sp, #\offset\()_0
+    ld2 { \vA\()0.s, \vA\()1.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()0]
+    ld2 { \vA\()2.s, \vA\()3.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()8]
+    ld2 { \vA\()4.s, \vA\()5.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()16]
+    ld2 { \vA\()6.s, \vA\()7.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()24]
+    ld2 { \vA\()8.s, \vA\()9.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()32]
+.endm
+
+// Limb-wise C = A - B without underflow, computed as A + (4p - B).
+// Assumes v28/v29 are preloaded with the limbs of 4*(2^255-19)
+// (v28 for limb 0, v29 for the higher limbs) — TODO confirm against the
+// setup code that initializes v28/v29.
+.macro vector_sub_inner vC0, vC2, vC4, vC6, vC8, vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8
+    // (2^255-19)*4 - vB
+    sub \vC0\().2s, v28.2s, \vB0\().2s
+    sub \vC2\().2s, v29.2s, \vB2\().2s
+    sub \vC4\().2s, v29.2s, \vB4\().2s
+    sub \vC6\().2s, v29.2s, \vB6\().2s
+    sub \vC8\().2s, v29.2s, \vB8\().2s
+
+    // ... + vA
+    add \vC0\().2s, \vA0\().2s, \vC0\().2s
+    add \vC2\().2s, \vA2\().2s, \vC2\().2s
+    add \vC4\().2s, \vA4\().2s, \vC4\().2s
+    add \vC6\().2s, \vA6\().2s, \vC6\().2s
+    add \vC8\().2s, \vA8\().2s, \vC8\().2s
+.endm
+
+// Convenience wrapper expanding register-name prefixes (even suffixes 0..8).
+.macro vector_sub vC, vA, vB
+    vector_sub_inner \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8, \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8
+.endm
+
+
+// Limb-wise vector addition: C = A + B (2x32-bit lanes per register).
+.macro vector_add_inner vC0, vC2, vC4, vC6, vC8, vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8
+    add \vC0\().2s, \vA0\().2s, \vB0\().2s
+    add \vC2\().2s, \vA2\().2s, \vB2\().2s
+    add \vC4\().2s, \vA4\().2s, \vB4\().2s
+    add \vC6\().2s, \vA6\().2s, \vB6\().2s
+    add \vC8\().2s, \vA8\().2s, \vB8\().2s
+.endm
+
+// Convenience wrapper expanding register-name prefixes (even suffixes 0..8).
+.macro vector_add vC, vA, vB
+    vector_add_inner \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8, \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8
+.endm
+
+// Conditional move on limb vectors: A = (EQ set) ? B : C, per 64-bit word.
+// fcsel_dform is presumably a helper emitting FCSEL on the d-register views
+// (branch-free select on the current flags) — defined elsewhere; confirm.
+.macro vector_cmov_inner vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8, vC0, vC2, vC4, vC6, vC8
+    fcsel_dform \vA0, \vB0, \vC0, eq
+    fcsel_dform \vA2, \vB2, \vC2, eq
+    fcsel_dform \vA4, \vB4, \vC4, eq
+    fcsel_dform \vA6, \vB6, \vC6, eq
+    fcsel_dform \vA8, \vB8, \vC8, eq
+.endm
+
+// Convenience wrapper expanding register-name prefixes (even suffixes 0..8).
+.macro vector_cmov vA, vB, vC
+    vector_cmov_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8, \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8,
+.endm
+
+// Interleave two limb sets into one: for each pair, the even-numbered
+// output takes the low (even) 32-bit lanes of B and C (trn1) and the
+// odd-numbered output takes their high (odd) lanes (trn2).
+.macro vector_transpose_inner vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9, vB0, vB2, vB4, vB6, vB8, vC0, vC2, vC4, vC6, vC8
+    trn2 \vA1\().2s, \vB0\().2s, \vC0\().2s
+    trn1 \vA0\().2s, \vB0\().2s, \vC0\().2s
+    trn2 \vA3\().2s, \vB2\().2s, \vC2\().2s
+    trn1 \vA2\().2s, \vB2\().2s, \vC2\().2s
+    trn2 \vA5\().2s, \vB4\().2s, \vC4\().2s
+    trn1 \vA4\().2s, \vB4\().2s, \vC4\().2s
+    trn2 \vA7\().2s, \vB6\().2s, \vC6\().2s
+    trn1 \vA6\().2s, \vB6\().2s, \vC6\().2s
+    trn2 \vA9\().2s, \vB8\().2s, \vC8\().2s
+    trn1 \vA8\().2s, \vB8\().2s, \vC8\().2s
+.endm
+
+// Convenience wrapper: \vA gets all ten outputs; \vB/\vC supply even words.
+.macro vector_transpose vA, vB, vC
+    vector_transpose_inner \vA\()0, \vA\()1, \vA\()2, \vA\()3, \vA\()4, \vA\()5, \vA\()6, \vA\()7, \vA\()8, \vA\()9, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8, \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8,
+.endm
+
+// Copy the low 64 bits (d[0]) of each even vector word into the
+// corresponding general-purpose register.
+.macro vector_to_scalar_inner sA0, sA2, sA4, sA6, sA8, vB0, vB2, vB4, vB6, vB8
+    mov \sA0, \vB0\().d[0]
+    mov \sA2, \vB2\().d[0]
+    mov \sA4, \vB4\().d[0]
+    mov \sA6, \vB6\().d[0]
+    mov \sA8, \vB8\().d[0]
+.endm
+
+// Convenience wrapper expanding register-name prefixes (even suffixes 0..8).
+.macro vector_to_scalar sA, vB
+    vector_to_scalar_inner \sA\()0, \sA\()2, \sA\()4, \sA\()6, \sA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8
+.endm
+
+// Copy each even general-purpose register into the low 64 bits (d[0]) of
+// the corresponding vector word (inverse of vector_to_scalar).
+.macro scalar_to_vector_inner vA0, vA2, vA4, vA6, vA8, sB0, sB2, sB4, sB6, sB8
+    mov \vA0\().d[0], \sB0
+    mov \vA2\().d[0], \sB2
+    mov \vA4\().d[0], \sB4
+    mov \vA6\().d[0], \sB6
+    mov \vA8\().d[0], \sB8
+.endm
+
+// Convenience wrapper expanding register-name prefixes (even suffixes 0..8).
+.macro scalar_to_vector vA, sB
+    scalar_to_vector_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \sB\()0, \sB\()2, \sB\()4, \sB\()6, \sB\()8
+.endm
+
+
+// Move the upper 64-bit half (d[1]) of each even source word into the
+// lower half (d[0]) of the corresponding destination word.
+.macro vector_extract_upper_inner vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8
+    mov \vA0\().d[0], \vB0\().d[1]
+    mov \vA2\().d[0], \vB2\().d[1]
+    mov \vA4\().d[0], \vB4\().d[1]
+    mov \vA6\().d[0], \vB6\().d[1]
+    mov \vA8\().d[0], \vB8\().d[1]
+.endm
+
+// Convenience wrapper expanding register-name prefixes (even suffixes 0..8).
+.macro vector_extract_upper vA, vB
+    vector_extract_upper_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8
+.endm
+
+// Compress ten source words into five: trn1 .4s interleaves the even
+// 32-bit lanes of each (vB[2k], vB[2k+1]) pair into one destination word.
+.macro vector_compress_inner vA0, vA2, vA4, vA6, vA8, vB0, vB1, vB2, vB3, vB4, vB5, vB6, vB7, vB8, vB9
+    trn1 \vA0\().4s, \vB0\().4s, \vB1\().4s
+    trn1 \vA2\().4s, \vB2\().4s, \vB3\().4s
+    trn1 \vA4\().4s, \vB4\().4s, \vB5\().4s
+    trn1 \vA6\().4s, \vB6\().4s, \vB7\().4s
+    trn1 \vA8\().4s, \vB8\().4s, \vB9\().4s
+.endm
+
+// Convenience wrapper: \vA gets even words 0..8 from source words 0..9.
+.macro vector_compress vA, vB
+    vector_compress_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()1, \vB\()2, \vB\()3, \vB\()4, \vB\()5, \vB\()6, \vB\()7, \vB\()8, \vB\()9,
+.endm
+
+// Strip accumulated carry bits from a scalar limb set: odd limbs are
+// masked to 25 bits; even limbs are truncated to 32 bits via the
+// zero-extending 32-bit self-move (mov w, w clears bits 63:32).
+// W<...> is SLOTHY notation for the 32-bit view of the register.
+// NOTE(review): \sA9 is a parameter but is never masked here — confirm
+// this is intentional (e.g. handled by a later reduction step).
+.macro scalar_clear_carries_inner sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9
+    and \sA1, \sA1, #0x1ffffff
+    and \sA3, \sA3, #0x1ffffff
+    and \sA5, \sA5, #0x1ffffff
+    and \sA7, \sA7, #0x1ffffff
+    mov W<\sA0>, W<\sA0>
+    mov W<\sA2>, W<\sA2>
+    mov W<\sA4>, W<\sA4>
+    mov W<\sA6>, W<\sA6>
+    mov W<\sA8>, W<\sA8>
+.endm
+
+// Convenience wrapper expanding register-name prefixes (suffixes 0..9).
+.macro scalar_clear_carries sA
+    scalar_clear_carries_inner \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9
+.endm
+
+// Unpack a scalar limb set: each even register holds two packed limbs in
+// one 64-bit word; extract the odd limb into its own register by shifting
+// down the upper 32 bits. The even registers are left unchanged.
+.macro scalar_decompress_inner sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9
+    lsr \sA1, \sA0, #32
+    lsr \sA3, \sA2, #32
+    lsr \sA5, \sA4, #32
+    lsr \sA7, \sA6, #32
+    lsr \sA9, \sA8, #32
+.endm
+
+// Convenience wrapper expanding register-name prefixes (suffixes 0..9).
+.macro scalar_decompress sA
+    scalar_decompress_inner \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9
+.endm
+
+    // TODO: eliminate those. should be easy
+    // Intermediate aliases for vector_addsub_repack_inner below.
+    // Names encode which low (l) / high (h) 32-bit limbs the word holds.
+    // NOTE(review): several aliases share the same vADBC registers; the
+    // repack sequence relies on their live ranges not overlapping — verify.
+    vR_l4h4l5h5 .req vADBC4
+    vR_l6h6l7h7 .req vADBC5
+
+    vR_l0h0l1h1 .req vADBC0
+    vR_l2h2l3h3 .req vADBC1
+
+    vR_l0123 .req vADBC4
+    vR_l4567 .req vADBC6
+    vR_h0123 .req vADBC5
+    vR_h4567 .req vADBC7
+    vR_l89h89 .req vADBC8
+
+    vR_h89xx .req vADBC9
+
+    vSum0123 .req vADBC0
+    vSum4567 .req vADBC1
+    vSum89xx .req vADBC2
+
+    vDiff0123 .req v10
+    vDiff4567 .req v11
+    vDiff89xx .req v12
+
+    // TODO: eliminate those explicit register assignments by converting stack_vld1r and stack_vldr_bform to AArch64Instruction
+    // Scratch registers for the mask loads in vector_addsub_repack_inner.
+    vrepack_inner_tmp .req v19
+    vrepack_inner_tmp2 .req v0
+
+
+// Combined add/sub and repacking step.
+// From the interleaved products \vC0..\vC9, separate the low (l) and high
+// (h) 32-bit limb groups via uzp1/uzp2/trn1, then form per-limb sums
+// (vSum* = l + h) and biased differences (vDiff* = l + mask - h; the mask
+// loaded from [sp] / STACK_MASK2 presumably biases the subtraction to
+// avoid underflow — confirm against the stack setup). Finally zip the
+// diff/sum lanes back into the ten output words \vA0..\vA9.
+// NOTE(review): the bare `B` operand of the ldr looks like a SLOTHY
+// symbolic b-register placeholder — confirm it resolves to the intended
+// byte view of vrepack_inner_tmp2.
+.macro vector_addsub_repack_inner vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9, vC0, vC1, vC2, vC3, vC4, vC5, vC6, vC7, vC8, vC9
+    uzp1 vR_l4h4l5h5.4s, \vC4\().4s, \vC5\().4s
+    uzp1 vR_l6h6l7h7.4s, \vC6\().4s, \vC7\().4s
+    ld1r {vrepack_inner_tmp.2d}, [sp] // @slothy:reads=mask1
+    uzp1 vR_l4567.4s, vR_l4h4l5h5.4s, vR_l6h6l7h7.4s
+    uzp2 vR_h4567.4s, vR_l4h4l5h5.4s, vR_l6h6l7h7.4s
+    trn1 vR_l89h89.4s, \vC8\().4s, \vC9\().4s
+    ldr B, [sp, #STACK_MASK2] // @slothy:reads=mask2
+    uzp1 vR_l0h0l1h1.4s, \vC0\().4s, \vC1\().4s
+    uzp1 vR_l2h2l3h3.4s, \vC2\().4s, \vC3\().4s
+    mov vR_h89xx.d[0], vR_l89h89.d[1]
+    uzp1 vR_l0123.4s, vR_l0h0l1h1.4s, vR_l2h2l3h3.4s
+    uzp2 vR_h0123.4s, vR_l0h0l1h1.4s, vR_l2h2l3h3.4s
+    add vDiff4567.4s, vR_l4567.4s, vrepack_inner_tmp.4s
+    add vDiff89xx.2s, vR_l89h89.2s, vrepack_inner_tmp.2s
+    mov vrepack_inner_tmp.b[0], vrepack_inner_tmp2.b[0]
+    add vSum0123.4s, vR_l0123.4s, vR_h0123.4s
+    add vSum4567.4s, vR_l4567.4s, vR_h4567.4s
+    add vSum89xx.2s, vR_l89h89.2s, vR_h89xx.2s
+    add vDiff0123.4s, vR_l0123.4s, vrepack_inner_tmp.4s
+    sub vDiff4567.4s, vDiff4567.4s, vR_h4567.4s
+    sub vDiff0123.4s, vDiff0123.4s, vR_h0123.4s
+    sub vDiff89xx.2s, vDiff89xx.2s, vR_h89xx.2s
+    // Re-interleave: even lanes take the diff, odd lanes the sum, then
+    // split the 128-bit words back into the ten-output layout.
+    zip1 \vA0\().4s, vDiff0123.4s, vSum0123.4s
+    zip2 \vA2\().4s, vDiff0123.4s, vSum0123.4s
+    zip1 \vA4\().4s, vDiff4567.4s, vSum4567.4s
+    zip2 \vA6\().4s, vDiff4567.4s, vSum4567.4s
+    zip1 \vA8\().2s, vDiff89xx.2s, vSum89xx.2s
+    zip2 \vA9\().2s, vDiff89xx.2s, vSum89xx.2s
+    mov \vA1\().d[0], \vA0\().d[1]
+    mov \vA3\().d[0], \vA2\().d[1]
+    mov \vA5\().d[0], \vA4\().d[1]
+    mov \vA7\().d[0], \vA6\().d[1]
+.endm
+
+// Convenience wrapper expanding register-name prefixes (suffixes 0..9).
+.macro vector_addsub_repack vA, vC
+vector_addsub_repack_inner \vA\()0, \vA\()1, \vA\()2, \vA\()3, \vA\()4, \vA\()5, \vA\()6, \vA\()7, \vA\()8, \vA\()9, \vC\()0, \vC\()1, \vC\()2, \vC\()3, \vC\()4, \vC\()5, \vC\()6, \vC\()7, \vC\()8, \vC\()9
+.endm
+
+// sAA0 .. sAA9 output AA = A^2
+// sA0 .. sA9 input A
+// TODO: simplify (this is still the same instruction order as before; we can make it simpler and leave the re-ordering to Slothy)
+//
+// Scalar squaring of a ten-limb field element. Phases, as visible below:
+//   1. decompress: extract odd limbs from the packed even registers (lsr #32);
+//   2. precompute doubled limbs (add x, x) for the cross terms;
+//   3. accumulate partial products with umull/umaddl (32x32->64);
+//   4. carry-propagate and repack pairs of limbs via lsr/and/bfi into
+//      the output registers sAA0..sAA9.
+// NOTE(review): the bare `X` / `W` operands appear to be SLOTHY symbolic
+// register placeholders that the tool assigns — confirm this file is only
+// consumed through SLOTHY and never assembled directly.
+.macro scalar_sqr_inner sAA0, sAA1, sAA2, sAA3, sAA4, sAA5, sAA6, sAA7, sAA8, sAA9, sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9
+    // Phase 1: unpack odd limbs (high 32 bits of each packed word).
+    lsr \sA1, \sA0, #32
+    lsr \sA3, \sA2, #32
+    lsr \sA5, \sA4, #32
+    lsr \sA7, \sA6, #32
+    lsr \sA9, \sA8, #32
+    // Phase 2: doubled limbs for the 2*ai*aj cross products.
+    add X, \sA9, \sA9
+    add X, \sA8, \sA8
+    add X, \sA7, \sA7
+    add X, \sA6, \sA6
+    add X, \sA5, \sA5
+    add X, \sA4, \sA4
+    add X, \sA3, \sA3
+    add X, \sA2, \sA2
+    add X, \sA1, \sA1
+    // Phase 3: accumulate the partial products of the schoolbook square.
+    umull X, W<\sA4>, W<\sA4>
+    umull X, W<\sA4>, W
+    mul W<\sA9>, W<\sA9>, W
+    mul W<\sA7>, W<\sA7>, W
+    mul W<\sA5>, W<\sA5>, W
+    umaddl X, W<\sA9>, W, X
+    umaddl X, W<\sA0>, W, X
+    umull X, W<\sA0>, W<\sA0>
+    umull X, W<\sA0>, W
+    umull X, W<\sA0>, W
+    umull X, W<\sA0>, W
+    umull X, W<\sA0>, W
+    umull X, W<\sA0>, W
+    umull X, W<\sA0>, W
+    umull X, W<\sA0>, W
+    umaddl X, W<\sA0>, W, X
+    mul W, W<\sA6>, W
+    umaddl X, W<\sA1>, W, X
+    umaddl X, W<\sA1>, W, X
+    umaddl X, W, W, X
+    umaddl X, W<\sA1>, W, X
+    umaddl X, W, W, X
+    umaddl X, W<\sA1>, W, X
+    umaddl X, W, W, X
+    umaddl X, W<\sA1>, W, X
+    mul W, W<\sA8>, W
+    umaddl X, W<\sA2>, W<\sA2>, X
+    umaddl X, W<\sA2>, W, X
+    umaddl X, W<\sA2>, W, X
+    umaddl X, W<\sA2>, W, X
+    umaddl X, W<\sA2>, W, X
+    umaddl X, W<\sA2>, W, X
+    umaddl X, W<\sA3>, W, X
+    umaddl X, W<\sA3>, W, X
+    umaddl X, W, W, X
+    umaddl X, W<\sA3>, W, X
+    umaddl X, W<\sA8>, W, X
+    umaddl X, W<\sA6>, W, X
+    add X, X, X, lsr #26
+    umaddl X, W<\sA5>, W, X
+    add X, X, X, lsr #25
+    bic X, X, #0x1ffffff
+    add X, X, X, lsr #24
+    and X, X, #0x1ffffff
+    add X, X, X, lsr #21
+    umaddl X, W<\sA7>, W, X
+    add X, X, X
+    add X, X, X
+    add X, X, X
+    add X, X, X
+    umaddl X, W, W, X
+    umaddl X, W, W, X
+    and X, X, #0x3ffffff
+    umaddl X, W<\sA7>, W, X
+    umaddl X, W<\sA7>, W, X
+    umaddl X, W<\sA7>, W, X
+    umaddl X, W<\sA7>, W, X
+    umaddl X, W, W, X
+    umaddl X, W, W, X
+    umaddl X, W, W, X
+    umaddl X, W, W, X
+    umaddl X, W, W, X
+    umaddl X, W, W, X
+    umaddl X, W<\sA9>, W, X
+    umaddl X, W<\sA9>, W, X
+    umaddl X, W<\sA9>, W, X
+    umaddl X, W<\sA9>, W, X
+    umaddl X, W<\sA9>, W, X
+    umaddl X, W<\sA9>, W, X
+    umaddl X, W<\sA9>, W, X
+    umaddl X, W<\sA9>, W, X
+    // Phase 4: carry chain (alternating 26/25-bit limbs) and repack two
+    // limbs per 64-bit output word with and/bfi.
+    add \sAA1, X, X, lsr #26
+    and \sAA0, X, #0x3ffffff
+    add \sAA2, X, \sAA1, lsr #25
+    bfi \sAA0, \sAA1, #32, #25
+    add \sAA3, X, \sAA2, lsr #26
+    and \sAA2, \sAA2, #0x3ffffff
+    add \sAA4, X, \sAA3, lsr #25
+    bfi \sAA2, \sAA3, #32, #25
+    add \sAA5, X, \sAA4, lsr #26
+    and \sAA4, \sAA4, #0x3ffffff
+    add \sAA6, X, \sAA5, lsr #25
+    bfi \sAA4, \sAA5, #32, #25
+    add \sAA7, X, \sAA6, lsr #26
+    and \sAA6, \sAA6, #0x3ffffff
+    add \sAA8, X, \sAA7, lsr #25
+    bfi \sAA6, \sAA7, #32, #25
+    add \sAA9, X, \sAA8, lsr #26
+    and \sAA8, \sAA8, #0x3ffffff
+    bfi \sAA8, \sAA9, #32, #26
+.endm
+
+// Convenience wrapper expanding register-name prefixes (suffixes 0..9).
+.macro scalar_sqr sAA, sA
+scalar_sqr_inner \sAA\()0, \sAA\()1, \sAA\()2, \sAA\()3, \sAA\()4, \sAA\()5, \sAA\()6, \sAA\()7, \sAA\()8, \sAA\()9, \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9
+.endm
+
+// sC0 .. sC9 output C = A*B
+// sA0 .. sA9 input A
+// sB0 .. sB9 input B
+.macro scalar_mul_inner sC0, sC1, sC2, sC3, sC4, sC5, sC6, sC7, sC8, sC9, sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9, sB0, sB1, sB2, sB3, sB4, sB5, sB6, sB7, sB8, sB9
+
+
+ mul W, W<\sA1>, W
+ mul W, W<\sA2>, W
+ mul W, W<\sA3>, W
+ mul W, W<\sA5>, W
+ mul W, W<\sA6>, W
+ mul W, W<\sA7>, W
+ mul W, W<\sA8>, W
+ mul W, W<\sA9>, W
+
+ umull X, W<\sA1>, W<\sB8>
+ umaddl X, W<\sA3>, W<\sB6>, X
+ umaddl X, W<\sA5>, W<\sB4>, X
+ umaddl X, W<\sA7>, W<\sB2>, X
+ umaddl X, W<\sA9>, W<\sB0>, X
+ umaddl X, W<\sA0>, W<\sB9>, X
+ umaddl X, W<\sA2>, W<\sB7>, X
+ umaddl X, W<\sA4>, W<\sB5>, X
+ umaddl X