diff --git a/.github/workflows/test_basic.yaml b/.github/workflows/test_basic.yaml index 8da89a0f..49231d93 100644 --- a/.github/workflows/test_basic.yaml +++ b/.github/workflows/test_basic.yaml @@ -19,6 +19,21 @@ jobs: - name: Run examples run: | python3 example.py --dry-run + tutorial: + if: ${{ github.event.label.name == 'needs-ci' || + github.event.pull_request.user.login == 'hanno-becker' || + github.event.pull_request.user.login == 'dop-amin' || + github.event.pull_request.user.login == 'mkannwischer' + }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Install python dependencies + run: | + python -m pip install -r requirements.txt + - name: Run tutorial + run: | + (cd tutorial && ./tutorial_all.sh) examples_basic: if: ${{ github.event.label.name == 'needs-ci' || github.event.pull_request.user.login == 'hanno-becker' || diff --git a/example.py b/example.py index c6330771..3e1181dd 100644 --- a/example.py +++ b/example.py @@ -517,6 +517,62 @@ def core(self, slothy): slothy.config.sw_pipelining.halving_heuristic_periodic = True slothy.optimize_loop("layer345_loop") +class AArch64Example0(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): + name = "aarch64_simple0" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target) + + def core(self,slothy): + slothy.config.variable_size=True + slothy.config.constraints.stalls_first_attempt=32 + slothy.optimize() + +class AArch64Example1(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): + name = "aarch64_simple0_macros" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target) + + def core(self,slothy): + slothy.config.variable_size=True + 
slothy.config.constraints.stalls_first_attempt=32 + slothy.optimize(start="start", end="end") + + +class AArch64Example2(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): + name = "aarch64_simple0_loop" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target) + + def core(self,slothy): + slothy.config.variable_size=True + slothy.config.constraints.stalls_first_attempt=32 + slothy.config.sw_pipelining.enabled = True + slothy.config.sw_pipelining.optimize_preamble = False + slothy.config.sw_pipelining.optimize_postamble = False + slothy.optimize_loop("start") + + class ntt_kyber_123_4567(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None): @@ -1197,6 +1253,13 @@ def main(): Example2(), Example3(), + AArch64Example0(), + AArch64Example0(target=Target_CortexA72), + AArch64Example1(), + AArch64Example1(target=Target_CortexA72), + AArch64Example2(), + AArch64Example2(target=Target_CortexA72), + CRT(), ntt_n256_l6_s32("bar"), diff --git a/examples/naive/aarch64/aarch64_simple0.s b/examples/naive/aarch64/aarch64_simple0.s new file mode 100644 index 00000000..17299dcc --- /dev/null +++ b/examples/naive/aarch64/aarch64_simple0.s @@ -0,0 +1,24 @@ +ldr q0, [x1, #0] +ldr q1, [x2, #0] + +ldr q8, [x0] +ldr q9, [x0, #1*16] +ldr q10, [x0, #2*16] +ldr q11, [x0, #3*16] + +mul v24.8h, v9.8h, v0.h[0] +sqrdmulh v9.8h, v9.8h, v0.h[1] +mls v24.8h, v9.8h, v1.h[0] +sub v9.8h, v8.8h, v24.8h +add v8.8h, v8.8h, v24.8h + +mul v24.8h, v11.8h, v0.h[0] +sqrdmulh v11.8h, v11.8h, v0.h[1] +mls v24.8h, v11.8h, v1.h[0] +sub v11.8h, v10.8h, v24.8h +add v10.8h, v10.8h, v24.8h + +str q8, [x0], #4*16 +str q9, [x0, #-3*16] +str q10, [x0, #-2*16] +str q11, [x0, #-1*16] \ No newline at end of file diff --git a/examples/naive/aarch64/aarch64_simple0_loop.s 
b/examples/naive/aarch64/aarch64_simple0_loop.s new file mode 100644 index 00000000..10512245 --- /dev/null +++ b/examples/naive/aarch64/aarch64_simple0_loop.s @@ -0,0 +1,55 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 +start: + + ldr qdata0, [data_ptr, #0*16] + ldr qdata1, [data_ptr, #1*16] + ldr qdata2, [data_ptr, #2*16] + ldr qdata3, [data_ptr, #3*16] + + butterfly data0, data1, tmp, twiddle, modulus + butterfly data2, data3, tmp, twiddle, modulus + + str qdata0, [data_ptr], #4*16 + str qdata1, [data_ptr, #-3*16] + str qdata2, [data_ptr, #-2*16] + str qdata3, [data_ptr, #-1*16] + + subs count, count, #1 + cbnz count, start diff --git a/examples/naive/aarch64/aarch64_simple0_macros.s b/examples/naive/aarch64/aarch64_simple0_macros.s new file mode 100644 index 00000000..d41f0056 --- /dev/null +++ b/examples/naive/aarch64/aarch64_simple0_macros.s @@ -0,0 +1,55 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] 
+.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 + +start: + + ldr qtwiddle, [twiddle_ptr, #0] + ldr qmodulus, [modulus_ptr, #0] + + ldr qdata0, [data_ptr, #0*16] + ldr qdata1, [data_ptr, #1*16] + ldr qdata2, [data_ptr, #2*16] + ldr qdata3, [data_ptr, #3*16] + + butterfly data0, data1, tmp, twiddle, modulus + butterfly data2, data3, tmp, twiddle, modulus + + str qdata0, [data_ptr], #4*16 + str qdata1, [data_ptr, #-3*16] + str qdata2, [data_ptr, #-2*16] + str qdata3, [data_ptr, #-1*16] + +end: diff --git a/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s b/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s new file mode 100644 index 00000000..f945ec37 --- /dev/null +++ b/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s @@ -0,0 +1,125 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 + ldr q3, [x0, #16] + sqrdmulh v7.8H, v3.8H, v0.H[1] + sub count, count, #1 +start: + mul v3.8H, v3.8H, v0.H[0] // ....*............. + // gap // .................. + ldr q19, [x0, #48] // ...*.............. + // gap // .................. + // gap // .................. + // gap // .................. + ldr q15, [x0, #0] // *................. 
+ // gap // .................. + // gap // .................. + // gap // .................. + mls v3.8H, v7.8H, v1.H[0] // ......*........... + // gap // .................. + mul v13.8H, v19.8H, v0.H[0] // .........*........ + // gap // .................. + sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........*....... + // gap // .................. + ldr q7, [x0, #32] // ..*............... + // gap // .................. + // gap // .................. + // gap // .................. + sub v17.8H, v15.8H, v3.8H // .......*.......... + // gap // .................. + add v10.8H, v15.8H, v3.8H // ........*......... + // gap // .................. + mls v13.8H, v19.8H, v1.H[0] // ...........*...... + // gap // .................. + str q17, [x0, #16] // ...............*.. + // gap // .................. + ldr q3, [x0, #80] // .e................ + // gap // .................. + // gap // .................. + // gap // .................. + add v15.8H, v7.8H, v13.8H // .............*.... + // gap // .................. + str q10, [x0], #4*16 // ..............*... + // gap // .................. + sub v13.8H, v7.8H, v13.8H // ............*..... + // gap // .................. + str q15, [x0, #-32] // ................*. + // gap // .................. + sqrdmulh v7.8H, v3.8H, v0.H[1] // .....e............ + // gap // .................. + str q13, [x0, #-16] // .................* + // gap // .................. + + // original source code + // ldr q8, [x0, #0*16] // .......|.*............... + // ldr q9, [x0, #1*16] // e......|..........e...... + // ldr q10, [x0, #2*16] // .......|.....*........... + // ldr q11, [x0, #3*16] // .......|*................ + // mul v12.8h, v9.8h, v0.h[0] // .......*................. + // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e.|...............e. + // mls v12.8h, v9.8h, v1.h[0] // .......|..*.............. + // sub v9.8h, v8.8h, v12.8h // .......|......*.......... + // add v8.8h, v8.8h, v12.8h // .......|.......*......... 
+ // mul v12.8h, v11.8h, v0.h[0] // .......|...*............. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // .......|....*............ + // mls v12.8h, v11.8h, v1.h[0] // .......|........*........ + // sub v11.8h, v10.8h, v12.8h // ...*...|.............*... + // add v10.8h, v10.8h, v12.8h // .*.....|...........*..... + // str q8, [x0], #4*16 // ..*....|............*.... + // str q9, [x0, #-3*16] // .......|.........*....... + // str q10, [x0, #-2*16] // ....*..|..............*.. + // str q11, [x0, #-1*16] // ......*|................* + + sub count, count, #1 + cbnz count, start + mul v3.8H, v3.8H, v0.H[0] + ldr q19, [x0, #48] + ldr q15, [x0, #0] + mls v3.8H, v7.8H, v1.H[0] + mul v13.8H, v19.8H, v0.H[0] + sqrdmulh v19.8H, v19.8H, v0.H[1] + ldr q7, [x0, #32] + sub v17.8H, v15.8H, v3.8H + add v10.8H, v15.8H, v3.8H + mls v13.8H, v19.8H, v1.H[0] + str q17, [x0, #16] + add v15.8H, v7.8H, v13.8H + str q10, [x0], #4*16 + sub v13.8H, v7.8H, v13.8H + str q15, [x0, #-32] + str q13, [x0, #-16] \ No newline at end of file diff --git a/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s b/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s new file mode 100644 index 00000000..3e8c3935 --- /dev/null +++ b/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s @@ -0,0 +1,117 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 + 
ldr q7, [x0, #16] + ldr q17, [x0, #48] + sqrdmulh v25.8H, v7.8H, v0.H[1] + mul v3.8H, v7.8H, v0.H[0] + sqrdmulh v27.8H, v17.8H, v0.H[1] + mls v3.8H, v25.8H, v1.H[0] + sub count, count, #1 +start: + ldr q15, [x0, #0] // *................. + ldr q7, [x0, #80] // .e................ + mul v13.8H, v17.8H, v0.H[0] // .........*........ + ldr q19, [x0, #32] // ..*............... + ldr q17, [x0, #112] // ...e.............. + // gap // .................. + // gap // .................. + // gap // .................. + mls v13.8H, v27.8H, v1.H[0] // ...........*...... + // gap // .................. + // gap // .................. + // gap // .................. + sqrdmulh v25.8H, v7.8H, v0.H[1] // .....e............ + // gap // .................. + add v4.8H, v15.8H, v3.8H // ........*......... + sub v10.8H, v15.8H, v3.8H // .......*.......... + // gap // .................. + // gap // .................. + mul v3.8H, v7.8H, v0.H[0] // ....e............. + // gap // .................. + // gap // .................. + sub v31.8H, v19.8H, v13.8H // ............*..... + // gap // .................. + // gap // .................. + sqrdmulh v27.8H, v17.8H, v0.H[1] // ..........e....... + add v28.8H, v19.8H, v13.8H // .............*.... + // gap // .................. + // gap // .................. + // gap // .................. + str q10, [x0, #16] // ...............*.. + mls v3.8H, v25.8H, v1.H[0] // ......e........... + str q31, [x0, #48] // .................* + str q4, [x0], #4*16 // ..............*... + str q28, [x0, #-32] // ................*. + // gap // .................. + // gap // .................. + + // original source code + // ldr q8, [x0, #0*16] // .................*................. + // ldr q9, [x0, #1*16] // e................|e................ + // ldr q10, [x0, #2*16] // ..*..............|..*.............. + // ldr q11, [x0, #3*16] // ...e.............|...e............. + // mul v12.8h, v9.8h, v0.h[0] // ........e........|........e........ 
+ // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e...........|.....e........... + // mls v12.8h, v9.8h, v1.h[0] // .............e...|.............e... + // sub v9.8h, v8.8h, v12.8h // .......*.........|.......*......... + // add v8.8h, v8.8h, v12.8h // ......*..........|......*.......... + // mul v12.8h, v11.8h, v0.h[0] // .*...............|.*............... + // sqrdmulh v11.8h, v11.8h, v0.h[1] // ..........e......|..........e...... + // mls v12.8h, v11.8h, v1.h[0] // ....*............|....*............ + // sub v11.8h, v10.8h, v12.8h // .........*.......|.........*....... + // add v10.8h, v10.8h, v12.8h // ...........*.....|...........*..... + // str q8, [x0], #4*16 // ...............*.|...............*. + // str q9, [x0, #-3*16] // ............*....|............*.... + // str q10, [x0, #-2*16] // ................*|................* + // str q11, [x0, #-1*16] // ..............*..|..............*.. + + sub count, count, #1 + cbnz count, start + ldr q15, [x0, #0] + mul v13.8H, v17.8H, v0.H[0] + ldr q19, [x0, #32] + mls v13.8H, v27.8H, v1.H[0] + add v4.8H, v15.8H, v3.8H + sub v10.8H, v15.8H, v3.8H + sub v31.8H, v19.8H, v13.8H + add v28.8H, v19.8H, v13.8H + str q10, [x0, #16] + str q31, [x0, #48] + str q4, [x0], #4*16 + str q28, [x0, #-32] \ No newline at end of file diff --git a/examples/opt/aarch64/aarch64_simple0_macros_opt_a55.s b/examples/opt/aarch64/aarch64_simple0_macros_opt_a55.s new file mode 100644 index 00000000..56215f2c --- /dev/null +++ b/examples/opt/aarch64/aarch64_simple0_macros_opt_a55.s @@ -0,0 +1,117 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro 
butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 + + start: + ldr q0, [x1, #0] // *................... + // gap // .................... + // gap // .................... + // gap // .................... + ldr q31, [x0, #16] // ...*................ + // gap // .................... + // gap // .................... + // gap // .................... + ldr q6, [x2, #0] // .*.................. + // gap // .................... + // gap // .................... + // gap // .................... + mul v29.8H, v31.8H, v0.H[0] // ......*............. + // gap // .................... + sqrdmulh v3.8H, v31.8H, v0.H[1] // .......*............ + // gap // .................... + ldr q14, [x0, #48] // .....*.............. + // gap // .................... + // gap // .................... + // gap // .................... + ldr q9, [x0, #0] // ..*................. + // gap // .................... + // gap // .................... + // gap // .................... + mul v10.8H, v14.8H, v0.H[0] // ...........*........ + // gap // .................... + sqrdmulh v22.8H, v14.8H, v0.H[1] // ............*....... + // gap // .................... + mls v29.8H, v3.8H, v6.H[0] // ........*........... + // gap // .................... + ldr q18, [x0, #32] // ....*............... + // gap // .................... + // gap // .................... + // gap // .................... + mls v10.8H, v22.8H, v6.H[0] // .............*...... + // gap // .................... + add v21.8H, v9.8H, v29.8H // ..........*......... + // gap // .................... + sub v29.8H, v9.8H, v29.8H // .........*.......... + // gap // .................... + // gap // .................... + // gap // .................... + str q21, [x0], #4*16 // ................*... + // gap // .................... + add v13.8H, v18.8H, v10.8H // ...............*.... + // gap // .................... 
+ str q29, [x0, #-48] // .................*.. + // gap // .................... + sub v3.8H, v18.8H, v10.8H // ..............*..... + // gap // .................... + str q13, [x0, #-32] // ..................*. + // gap // .................... + // gap // .................... + // gap // .................... + str q3, [x0, #-16] // ...................* + // gap // .................... + + // original source code + // ldr q0, [x1, #0] // *................... + // ldr q1, [x2, #0] // ..*................. + // ldr q8, [x0, #0*16] // ......*............. + // ldr q9, [x0, #1*16] // .*.................. + // ldr q10, [x0, #2*16] // ..........*......... + // ldr q11, [x0, #3*16] // .....*.............. + // mul v12.8h, v9.8h, v0.h[0] // ...*................ + // sqrdmulh v9.8h, v9.8h, v0.h[1] // ....*............... + // mls v12.8h, v9.8h, v1.h[0] // .........*.......... + // sub v9.8h, v8.8h, v12.8h // .............*...... + // add v8.8h, v8.8h, v12.8h // ............*....... + // mul v12.8h, v11.8h, v0.h[0] // .......*............ + // sqrdmulh v11.8h, v11.8h, v0.h[1] // ........*........... + // mls v12.8h, v11.8h, v1.h[0] // ...........*........ + // sub v11.8h, v10.8h, v12.8h // .................*.. + // add v10.8h, v10.8h, v12.8h // ...............*.... + // str q8, [x0], #4*16 // ..............*..... + // str q9, [x0, #-3*16] // ................*... + // str q10, [x0, #-2*16] // ..................*. 
+ // str q11, [x0, #-1*16] // ...................* + + end: diff --git a/examples/opt/aarch64/aarch64_simple0_macros_opt_a72.s b/examples/opt/aarch64/aarch64_simple0_macros_opt_a72.s new file mode 100644 index 00000000..8cd8d874 --- /dev/null +++ b/examples/opt/aarch64/aarch64_simple0_macros_opt_a72.s @@ -0,0 +1,130 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 + +start: + ldr q30, [x0, #48] // .....*.............. + ldr q9, [x1, #0] // *................... +// gap // .................... + ldr q6, [x0, #0] // ..*................. + ldr q18, [x0, #32] // ....*............... +// gap // .................... + ldr q27, [x0, #16] // ...*................ + ldr q7, [x2, #0] // .*.................. +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + mul v16.8H, v30.8H, v9.H[0] // ...........*........ +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + sqrdmulh v31.8H, v30.8H, v9.H[1] // ............*....... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + sqrdmulh v15.8H, v27.8H, v9.H[1] // .......*............ +// gap // .................... +// gap // .................... 
+// gap // .................... +// gap // .................... +// gap // .................... + mul v27.8H, v27.8H, v9.H[0] // ......*............. +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + mls v16.8H, v31.8H, v7.H[0] // .............*...... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + mls v27.8H, v15.8H, v7.H[0] // ........*........... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + add v30.8H, v18.8H, v16.8H // ...............*.... + sub v24.8H, v18.8H, v16.8H // ..............*..... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + sub v7.8H, v6.8H, v27.8H // .........*.......... + add v27.8H, v6.8H, v27.8H // ..........*......... +// gap // .................... + str q24, [x0, #48] // ...................* + str q30, [x0, #32] // ..................*. +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + str q7, [x0, #16] // .................*.. + str q27, [x0], #4*16 // ................*... +// gap // .................... + +// original source code +// ldr q0, [x1, #0] // .*.................. +// ldr q1, [x2, #0] // .....*.............. +// ldr q8, [x0, #0*16] // ..*................. +// ldr q9, [x0, #1*16] // ....*............... +// ldr q10, [x0, #2*16] // ...*................ +// ldr q11, [x0, #3*16] // *................... +// mul v12.8h, v9.8h, v0.h[0] // .........*.......... +// sqrdmulh v9.8h, v9.8h, v0.h[1] // ........*........... 
+// mls v12.8h, v9.8h, v1.h[0] // ...........*........ +// sub v9.8h, v8.8h, v12.8h // ..............*..... +// add v8.8h, v8.8h, v12.8h // ...............*.... +// mul v12.8h, v11.8h, v0.h[0] // ......*............. +// sqrdmulh v11.8h, v11.8h, v0.h[1] // .......*............ +// mls v12.8h, v11.8h, v1.h[0] // ..........*......... +// sub v11.8h, v10.8h, v12.8h // .............*...... +// add v10.8h, v10.8h, v12.8h // ............*....... +// str q8, [x0], #4*16 // ...................* +// str q9, [x0, #-3*16] // ..................*. +// str q10, [x0, #-2*16] // .................*.. +// str q11, [x0, #-1*16] // ................*... + +end: diff --git a/examples/opt/aarch64/aarch64_simple0_opt_a55.s b/examples/opt/aarch64/aarch64_simple0_opt_a55.s new file mode 100644 index 00000000..66feac88 --- /dev/null +++ b/examples/opt/aarch64/aarch64_simple0_opt_a55.s @@ -0,0 +1,78 @@ + ldr q2, [x1, #0] // *................... + // gap // .................... + // gap // .................... + // gap // .................... + ldr q0, [x0, #48] // .....*.............. + // gap // .................... + // gap // .................... + // gap // .................... + ldr q31, [x0, #16] // ...*................ + // gap // .................... + // gap // .................... + // gap // .................... + mul v7.8H, v0.8H, v2.H[0] // ...........*........ + // gap // .................... + ldr q12, [x2, #0] // .*.................. + // gap // .................... + // gap // .................... + // gap // .................... + sqrdmulh v23.8H, v31.8H, v2.H[1] // .......*............ + // gap // .................... + mul v22.8H, v31.8H, v2.H[0] // ......*............. + // gap // .................... + sqrdmulh v2.8H, v0.8H, v2.H[1] // ............*....... + // gap // .................... + ldr q28, [x0] // ..*................. + // gap // .................... + // gap // .................... + // gap // .................... 
+ mls v22.8H, v23.8H, v12.H[0] // ........*........... + // gap // .................... + ldr q23, [x0, #32] // ....*............... + // gap // .................... + // gap // .................... + // gap // .................... + mls v7.8H, v2.8H, v12.H[0] // .............*...... + // gap // .................... + add v12.8H, v28.8H, v22.8H // ..........*......... + // gap // .................... + sub v2.8H, v28.8H, v22.8H // .........*.......... + // gap // .................... + // gap // .................... + // gap // .................... + str q12, [x0], #4*16 // ................*... + // gap // .................... + add v8.8H, v23.8H, v7.8H // ...............*.... + // gap // .................... + str q2, [x0, #-48] // .................*.. + // gap // .................... + sub v22.8H, v23.8H, v7.8H // ..............*..... + // gap // .................... + str q8, [x0, #-32] // ..................*. + // gap // .................... + // gap // .................... + // gap // .................... + str q22, [x0, #-16] // ...................* + // gap // .................... + + // original source code + // ldr q0, [x1, #0] // *................... + // ldr q1, [x2, #0] // ....*............... + // ldr q8, [x0] // ........*........... + // ldr q9, [x0, #1*16] // ..*................. + // ldr q10, [x0, #2*16] // ..........*......... + // ldr q11, [x0, #3*16] // .*.................. + // mul v24.8h, v9.8h, v0.h[0] // ......*............. + // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....*.............. + // mls v24.8h, v9.8h, v1.h[0] // .........*.......... + // sub v9.8h, v8.8h, v24.8h // .............*...... + // add v8.8h, v8.8h, v24.8h // ............*....... + // mul v24.8h, v11.8h, v0.h[0] // ...*................ + // sqrdmulh v11.8h, v11.8h, v0.h[1] // .......*............ + // mls v24.8h, v11.8h, v1.h[0] // ...........*........ + // sub v11.8h, v10.8h, v24.8h // .................*.. + // add v10.8h, v10.8h, v24.8h // ...............*.... 
+ // str q8, [x0], #4*16 // ..............*..... + // str q9, [x0, #-3*16] // ................*... + // str q10, [x0, #-2*16] // ..................*. + // str q11, [x0, #-1*16] // ...................* diff --git a/examples/opt/aarch64/aarch64_simple0_opt_a72.s b/examples/opt/aarch64/aarch64_simple0_opt_a72.s new file mode 100644 index 00000000..f1a6f5cf --- /dev/null +++ b/examples/opt/aarch64/aarch64_simple0_opt_a72.s @@ -0,0 +1,91 @@ + ldr q0, [x1, #0] // *................... + ldr q7, [x0, #16] // ...*................ +// gap // .................... + ldr q10, [x0, #48] // .....*.............. +// gap // .................... +// gap // .................... + ldr q13, [x2, #0] // .*.................. +// gap // .................... +// gap // .................... + ldr q3, [x0, #32] // ....*............... +// gap // .................... +// gap // .................... + sqrdmulh v2.8H, v7.8H, v0.H[1] // .......*............ + ldr q9, [x0] // ..*................. +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + mul v26.8H, v7.8H, v0.H[0] // ......*............. +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + sqrdmulh v14.8H, v10.8H, v0.H[1] // ............*....... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + mls v26.8H, v2.8H, v13.H[0] // ........*........... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + mul v11.8H, v10.8H, v0.H[0] // ...........*........ +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... 
+ mls v11.8H, v14.8H, v13.H[0] // .............*...... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + add v22.8H, v9.8H, v26.8H // ..........*......... + sub v9.8H, v9.8H, v26.8H // .........*.......... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + str q9, [x0, #16] // .................*.. + sub v31.8H, v3.8H, v11.8H // ..............*..... + add v9.8H, v3.8H, v11.8H // ...............*.... + str q22, [x0], #4*16 // ................*... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... +// gap // .................... + str q31, [x0, #-16] // ...................* + str q9, [x0, #-32] // ..................*. +// gap // .................... + +// original source code +// ldr q0, [x1, #0] // *................... +// ldr q1, [x2, #0] // ...*................ +// ldr q8, [x0] // ......*............. +// ldr q9, [x0, #1*16] // .*.................. +// ldr q10, [x0, #2*16] // ....*............... +// ldr q11, [x0, #3*16] // ..*................. +// mul v24.8h, v9.8h, v0.h[0] // .......*............ +// sqrdmulh v9.8h, v9.8h, v0.h[1] // .....*.............. +// mls v24.8h, v9.8h, v1.h[0] // .........*.......... +// sub v9.8h, v8.8h, v24.8h // .............*...... +// add v8.8h, v8.8h, v24.8h // ............*....... +// mul v24.8h, v11.8h, v0.h[0] // ..........*......... +// sqrdmulh v11.8h, v11.8h, v0.h[1] // ........*........... +// mls v24.8h, v11.8h, v1.h[0] // ...........*........ +// sub v11.8h, v10.8h, v24.8h // ...............*.... +// add v10.8h, v10.8h, v24.8h // ................*... +// str q8, [x0], #4*16 // .................*.. +// str q9, [x0, #-3*16] // ..............*..... 
+// str q10, [x0, #-2*16] // ...................* +// str q11, [x0, #-1*16] // ..................*. diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py index d43a39c2..596ec311 100644 --- a/slothy/core/slothy.py +++ b/slothy/core/slothy.py @@ -232,9 +232,6 @@ def optimize(self, start=None, end=None, loop_synthesis_cb=None, logname=None): # Check if the body has a dominant indentation indentation = AsmHelper.find_indentation(body) - if self.config.with_llvm_mca_before is True: - orig_stats = self._make_llvm_mca_stats(pre, body, "ORIGINAL", indentation) - if c.with_preprocessor: self.logger.info("Apply C preprocessor...") body = CPreprocessor.unfold(pre, body, c.compiler_binary) @@ -246,6 +243,10 @@ def optimize(self, start=None, end=None, loop_synthesis_cb=None, logname=None): body = AsmAllocation.unfold_all_aliases(c.register_aliases, body) body = SourceLine.apply_indentation(body, indentation) self.logger.info("Instructions in body: %d", len(list(filter(None, body)))) + + if self.config.with_llvm_mca_before is True: + orig_stats = self._make_llvm_mca_stats(pre, body, "ORIGINAL", indentation) + early, core, late, num_exceptional = Heuristics.periodic(body, logger, c) if self.config.with_llvm_mca_before is True: @@ -265,6 +266,7 @@ def indented(code): if end is not None: core += [SourceLine(f"{end}:")] + core = SourceLine.apply_indentation(core, self.config.indentation) if not self.config.sw_pipelining.enabled: assert early == [] assert late == [] @@ -395,9 +397,6 @@ def optimize_loop(self, loop_lbl, postamble_label=None): c = self.config.copy() c.add_aliases(aliases) - if self.config.with_llvm_mca_before is True: - orig_stats = self._make_llvm_mca_stats(early, body, "ORIGINAL", indentation) - if c.with_preprocessor: self.logger.info("Apply C preprocessor...") body = CPreprocessor.unfold(early, body, c.compiler_binary) @@ -408,10 +407,12 @@ def optimize_loop(self, loop_lbl, postamble_label=None): body = AsmMacro.unfold_all_macros(early, body, 
inherit_comments=c.inherit_macro_comments) body = AsmAllocation.unfold_all_aliases(c.register_aliases, body) body = SourceLine.apply_indentation(body, indentation) - self.logger.info("Optimizing loop %s (%d instructions) ...", loop_lbl, len(body)) + if self.config.with_llvm_mca_before is True: + orig_stats = self._make_llvm_mca_stats(early, body, "ORIGINAL", indentation) + preamble_code, kernel_code, postamble_code, num_exceptional = \ Heuristics.periodic(body, logger, c) @@ -419,16 +420,19 @@ def optimize_loop(self, loop_lbl, postamble_label=None): kernel_code = kernel_code + orig_stats if self.config.with_llvm_mca_after is True: + print(SourceLine.write_multiline(kernel_code)) new_stats_kernel = self._make_llvm_mca_stats(early, kernel_code, "OPTIMIZED", indentation) kernel_code = kernel_code + new_stats_kernel - if len(preamble_code) > 0: + if self.config.sw_pipelining.optimize_preamble is True \ + and len(preamble_code) > 0: new_stats_preamble = self._make_llvm_mca_stats(early, preamble_code, "PREAMBLE", indentation) preamble_code = preamble_code + new_stats_preamble - if len(postamble_code) > 0: + if self.config.sw_pipelining.optimize_postamble is True \ + and len(postamble_code) > 0: new_stats_postamble = self._make_llvm_mca_stats(early, postamble_code, "POSTAMBLE", indentation) postamble_code = postamble_code + new_stats_postamble diff --git a/tutorial/README.md b/tutorial/README.md new file mode 100644 index 00000000..a49bb728 --- /dev/null +++ b/tutorial/README.md @@ -0,0 +1,1050 @@ +# SLOTHY Tutorial + +This tutorial introduces you to using the SLOTHY superoptimizer for optimizing assembly programs for a specific microarchitecture. +It goes beyond what is written in the [README](../README.md) or the [SLOTHY +paper](https://eprint.iacr.org/2022/1303.pdf) in that it gives more examples on how we, the developers of SLOTHY, +typically use SLOTHY to optimize cryptographic code. 
At the end of the tutorial, you should be familiar with +the workflow of using SLOTHY as well as a number of common ways to debug or improve your results. + +## Introduction to SLOTHY + +SLOTHY is a fixed instruction superoptimizer: Its input is assembly and its output is semantically-equivalent optimized +assembly using the same instructions and data flow. The fact that SLOTHY does not change instructions is very important +both theoretically (in terms of complexity of optimization) and practically (in terms of developer control) and sets SLOTHY apart from +_synthesizing_ superoptimizers like [souper](https://github.com/google/souper). + +Concretely, SLOTHY performs three main jobs: +1. (Re-)schedule instructions to hide latencies and improve utilization of all execution units. +2. Rename registers in case this enables a better scheduling. +3. Perform software pipelining (aka periodic loop interleaving). We will cover software pipelining in more depth later in this tutorial. + +SLOTHY performs these jobs by first lifting the input assembly into a data-flow graph (DFG) modelling dependencies +between instructions. At this level, the ordering of instructions and the choice of register names is no longer visible. +The goal of SLOTHY, then, is to find a traversal/lowering of the DFG that results in the least +number of pipeline stalls. A traversal/lowering of the graph is assigning to each instruction an index at which +the instruction will be in the output, plus a choice of registers to be used for its outputs. SLOTHY does so by turning +the graph together with information about the (micro)architecture into constraints that are fed into an external constraint +solver; so far, we have been using Google OR-tools, but in principle one can use other solvers as well. +Constraints come in two flavours: Architectural and microarchitectural. Architectural constraints simply ensure that the +resulting code is architecturally valid (e.g. 
SLOTHY does not use a vector register in a scalar instruction) and +functionally correct (it has the same DFG). Microarchitectural constraints imply (hopefully) that the code will run fast +on the target; SLOTHY models microarchitectures in terms of issue width, instruction latencies, throughput, forwarding +paths, and the number of execution units able to execute certain instructions. We refer to the [SLOTHY +paper](https://eprint.iacr.org/2022/1303.pdf) for details of the constraint model, which are not relevant here. + +Note again that SLOTHY does (largely) not change instructions: Instruction selection is left to the developer. +For cryptographic code -- which is what SLOTHY was developed for -- instruction selection is a core focus of research +and highly-optimized instruction sequences implementing a cryptographic (sub-)routine usually exist. Tight control over +the choice of instructions is also important from a security perspective, as variable-time instructions have to be +avoided. + +**High-assurance cryptography**: While formal verification is not part of SLOTHY itself, there is potential for +combining existing formal verification tools with SLOTHY. From a high level, formal verification should be relatively +simple, owing to the fact that SLOTHY does not change the DFG: In fact, SLOTHY itself includes a selfcheck that +lifts the output assembly back into a DFG and confirms that it is isomorphic to the input DFG via the permutation found by +SLOTHY. However, while this is a strong indicator of correctness of the output assembly, it does _not_ amount to a +formal verification, as pitfalls do remain (notably bad user configurations and subtleties around modelling +memory and load/store offsets which we will not discuss in this tutorial). Research into combining SLOTHY with trusted +verification infrastructure is therefore needed. 
As a first promising example, AWS-LC has recently +[integrated](https://github.com/aws/aws-lc/pull/1478) an implementation of X25519 that was auto-generated by SLOTHY and +formally verified using the [HOL-Light](https://github.com/jrh13/hol-light) proof assistant. + +## Table of contents + +1) [Installation](#1-installation). This is limited to the fastest way of installing SLOTHY using pip. For more complete instructions, see the [README](../README.md). +2) [Getting started](#2-getting-started) +3) [Using SLOTHY for your own code](#3-writing-your-own-calling-code) +4) [Using SLOTHY's Software Pipelining](#4-software-pipelining) +5) [Checking the quality of SLOTHY optimizations](#5-checking-the-quality-of-slothy-optimizations) +6) [Optimizing a full Neon NTT](#6-optimizing-a-full-neon-ntt) +7) [Optimizing larger pieces of code](#7-optimizing-larger-pieces-of-code) +8) [Adding a new microarchitecture](#8-adding-a-new-microarchitecture) + +The SLOTHY calling code used for the parts 3-7 is located in `tutorial-{3a,3b,4,5,6,7}.py`. + +## 1. Installation + +SLOTHY requires python3 (>= 3.10). +The easiest way to install the dependencies of SLOTHY is using pip. +It's advised to make use of [virtual environment](https://docs.python.org/3/library/venv.html). + +The following steps should get you started: + +```bash +git clone https://github.com/slothy-optimizer/slothy +cd slothy +# setup venv +python3 -m venv venv +source venv/bin/activate +# install dependencies +pip install -r requirements.txt +``` + +You can try to run SLOTHY on one of the examples that come with SLOTHY to make sure it runs without errors: +``` +python3 example.py --examples simple0 +``` + +We will look into more examples shortly and discuss input, output, and available flags. + +## 2. Getting Started + +The simplest way to get started using SLOTHY is by trying out some of the examples that come with SLOTHY. 
+Once you work on your own code, you will likely be using the `slothy-cli` command or calling the SLOTHY module from your own Python script to invoke SLOTHY, which allows you to control all the different options SLOTHY has. +However, for now we will be using the [example.py](../example.py) script, which contains a number of examples including the ones we have optimized in the SLOTHY paper. +You can run `python3 example.py --help` to see all examples available. + +Let's look at a very simple example from the previous section called `aarch64_simple0`. +You can find the corresponding code in [examples/naive/aarch64/aarch64_simple0.s](../examples/naive/aarch64/aarch64_simple0.s): +```nasm +ldr q0, [x1, #0] +ldr q1, [x2, #0] + +ldr q8, [x0] +ldr q9, [x0, #1*16] +ldr q10, [x0, #2*16] +ldr q11, [x0, #3*16] + +mul v24.8h, v9.8h, v0.h[0] +sqrdmulh v9.8h, v9.8h, v0.h[1] +mls v24.8h, v9.8h, v1.h[0] +sub v9.8h, v8.8h, v24.8h +add v8.8h, v8.8h, v24.8h + +mul v24.8h, v11.8h, v0.h[0] +sqrdmulh v11.8h, v11.8h, v0.h[1] +mls v24.8h, v11.8h, v1.h[0] +sub v11.8h, v10.8h, v24.8h +add v10.8h, v10.8h, v24.8h + +str q8, [x0], #4*16 +str q9, [x0, #-3*16] +str q10, [x0, #-2*16] +str q11, [x0, #-1*16] +``` + +It contains a straight-line piece of assembly for the Armv8-A architecture. This architecture implements the Neon vector instruction extension and all the instructions in this example are Neon vector instructions. +If you have never written Neon assembly before, you do not have to worry about it at this point. +All you need to know about the code is that it loads some vectors from memory, performs some arithmetic operations, and writes back the result to memory. +Note that there are two independent streams of computation on the four vectors loaded from memory, and, hence, there are quite a few possibilities to re-order this code without affecting its semantics.
+This code is able to run on a variety of different microarchitectures, ranging from low-end energy efficient in-order cores like the Arm Cortex-A55 to high-end out-of-order CPUs with very complex pipelines like the Apple M1 or Arm Neoverse server CPUs. +For the in-order cores, the instruction scheduling plays the most essential role as poorly scheduled code is very likely to have poor performance, and hence, we will focus on the Cortex-A55 architecture in the following. +Note, however, that SLOTHY has been used to also obtain significant speed-ups for out-of-order cores. + +SLOTHY comes with models for various Arm architectures, including the power-efficient, in-order +[Cortex-A55](https://developer.arm.com/Processors/Cortex-A55), so we can now optimize this piece of code for that +microarchitecture. [example.py](../example.py) contains the needed SLOTHY incarnations for convenience, so we can simply run `python3 +example.py --examples aarch64_simple0_a55` which will optimize for the Cortex-A55 microarchitecture. You can check +[example.py](../example.py) for the details. This will optimize the piece of code above and write the output code to +[examples/opt/aarch64/aarch64_simple0_opt_a55.s](../examples/opt/aarch64/aarch64_simple0_opt_a55.s). +SLOTHY should print something similar to this: +``` +INFO:aarch64_simple0_a55:Instructions in body: 20 +INFO:aarch64_simple0_a55.slothy:Perform internal binary search for minimal number of stalls... +INFO:aarch64_simple0_a55.slothy:Attempt optimization with max 32 stalls... +INFO:aarch64_simple0_a55.slothy:Objective: minimize number of stalls +INFO:aarch64_simple0_a55.slothy:Invoking external constraint solver (OR-Tools CP-SAT v9.7.2996) ... +INFO:aarch64_simple0_a55.slothy:[0.0653s]: Found 1 solutions so far... objective 19.0, bound 12.0 (minimize number of stalls) +INFO:aarch64_simple0_a55.slothy:[0.0801s]: Found 2 solutions so far... 
objective 18.0, bound 12.0 (minimize number of stalls) +INFO:aarch64_simple0_a55.slothy:OPTIMAL, wall time: 0.180540 s +INFO:aarch64_simple0_a55.slothy:Booleans in result: 449 +INFO:aarch64_simple0_a55.slothy.selfcheck:OK! +INFO:aarch64_simple0_a55.slothy:Minimum number of stalls: 18 +``` + +You can follow the steps SLOTHY performs and see the calls to the constraint solver trying to find a re-scheduling of this code containing at most 32 stalls (a default starting point we have set here to speed up the example). +At the same time it is trying to minimize the number of stalls. This is passed as an objective to the constraint solver (OR-tools) which tries to find a solution with the minimum number of stalls. +The best solution it can find has 18 stalls -- which is guaranteed to be the minimum number of stalls given this piece of code and the model of the microarchitecture in SLOTHY. +In the last step, SLOTHY will transform the found traversal of the DFG into actual assembly and write it to the file. +To make sure everything worked out as expected, it will perform a selfcheck which consists of transforming the output assembly into a DFG again and testing that the resulting graph is isomorphic to the input DFG. + +We can now take a look at the output assembly in [examples/opt/aarch64/aarch64_simple0_opt_a55.s](../examples/opt/aarch64/aarch64_simple0_opt_a55.s): +```nasm +ldr q8, [x1, #0] // *................... +// gap // .................... +// gap // .................... +// gap // .................... +ldr q30, [x0, #16] // ...*................ +// gap // .................... +// gap // .................... +// gap // .................... +ldr q25, [x0] // ..*................. +// gap // .................... +// gap // .................... +// gap // .................... +mul v13.8H, v30.8H, v8.H[0] // ......*............. +// gap // .................... +sqrdmulh v21.8H, v30.8H, v8.H[1] // .......*............ +// gap // ....................
+ldr q30, [x0, #48] // .....*.............. +// gap // .................... +// gap // .................... +// gap // .................... +ldr q3, [x2, #0] // .*.................. +// gap // .................... +// gap // .................... +// gap // .................... +sqrdmulh v5.8H, v30.8H, v8.H[1] // ............*....... +// gap // .................... +mul v30.8H, v30.8H, v8.H[0] // ...........*........ +// gap // .................... +mls v13.8H, v21.8H, v3.H[0] // ........*........... +// gap // .................... +ldr q15, [x0, #32] // ....*............... +// gap // .................... +// gap // .................... +// gap // .................... +mls v30.8H, v5.8H, v3.H[0] // .............*...... +// gap // .................... +add v8.8H, v25.8H, v13.8H // ..........*......... +// gap // .................... +sub v20.8H, v25.8H, v13.8H // .........*.......... +// gap // .................... +// gap // .................... +// gap // .................... +str q8, [x0], #4*16 // ................*... +// gap // .................... +add v26.8H, v15.8H, v30.8H // ...............*.... +// gap // .................... +str q20, [x0, #-48] // .................*.. +// gap // .................... +sub v5.8H, v15.8H, v30.8H // ..............*..... +// gap // .................... +str q26, [x0, #-32] // ..................*. +// gap // .................... +// gap // .................... +// gap // .................... +str q5, [x0, #-16] // ...................* +// gap // .................... + +// original source code +// ldr q0, [x1, #0] // *................... +// ldr q1, [x2, #0] // ......*............. +// ldr q8, [x0] // ..*................. +// ldr q9, [x0, #1*16] // .*.................. +// ldr q10, [x0, #2*16] // ..........*......... +// ldr q11, [x0, #3*16] // .....*.............. +// mul v24.8h, v9.8h, v0.h[0] // ...*................ +// sqrdmulh v9.8h, v9.8h, v0.h[1] // ....*............... 
+// mls v24.8h, v9.8h, v1.h[0] // .........*.......... +// sub v9.8h, v8.8h, v24.8h // .............*...... +// add v8.8h, v8.8h, v24.8h // ............*....... +// mul v24.8h, v11.8h, v0.h[0] // ........*........... +// sqrdmulh v11.8h, v11.8h, v0.h[1] // .......*............ +// mls v24.8h, v11.8h, v1.h[0] // ...........*........ +// sub v11.8h, v10.8h, v24.8h // .................*.. +// add v10.8h, v10.8h, v24.8h // ...............*.... +// str q8, [x0], #4*16 // ..............*..... +// str q9, [x0, #-3*16] // ................*... +// str q10, [x0, #-2*16] // ..................*. +// str q11, [x0, #-1*16] // ...................* +``` + +At the top you can see the re-scheduled assembly and at the bottom you find the original source code as a comment. +As comments next to the two sections, you can also see a visual representation on how these instructions have been rescheduled. +You can see that various instructions have been moved around to achieve fewer stalls. + +Note that if you do run SLOTHY again, it may produce a different scheduling with the same minimal number of stalls. +This is expected and due to the constraint solver not producing deterministic outputs. + +In the scheduled code, you can see `// gap` where SLOTHY would expect a "gap" in the current model: +This is not a pipeline stall in the sense of a wasted cycle, but rather an issue slot of +the CPU that was not used. The Cortex-A55 is a dual-issue CPU meaning in ideal circumstances 2 instructions can be issued per cycle. +However, the Neon pipeline can only issue a single (128-bit/q-form) Neon instruction per cycle. +Since our code only consists of (128-bit/q-form) Neon instructions, the best we can hope for is a single `gap` after each instruction. +To make use of these issue slots one would have to mix in scalar instructions (or use 64-bit (d-form) Neon instructions). 
+ +Also note the registers used: In the original code `v24` was used as a temporary register in both computation streams, preventing them from being effectively interleaved. +SLOTHY renamed those registers to be able to interleave both computations. Other registers have also been arbitrarily +renamed, but without any specific reason. + +## 3. Writing your own calling code + +When writing your own calls to SLOTHY, there are generally two options: +(1) Using SLOTHY as a Python module, or (2) using `slothy-cli` with command line options. We will continue with (1) to demonstrate some features. +To reproduce the example above, you can place the following code into your own Python script in the root directory of SLOTHY: + +```python +import logging +import sys + +from slothy import Slothy + +import slothy.targets.aarch64.aarch64_neon as AArch64_Neon +import slothy.targets.aarch64.cortex_a55 as Target_CortexA55 + +logging.basicConfig(stream=sys.stdout, level=logging.INFO) + +arch = AArch64_Neon +target = Target_CortexA55 + +slothy = Slothy(arch, target) + +# example +slothy.load_source_from_file("examples/naive/aarch64/aarch64_simple0.s") +slothy.config.variable_size=True +slothy.config.constraints.stalls_first_attempt=32 + +slothy.optimize() +slothy.write_source_to_file("opt/aarch64_simple0_a55.s") +``` + +You will need to pass to SLOTHY both the architecture model (containing the instruction mnemonics and which registers +are inputs and outputs for each instruction) and the microarchitectural model (containing latencies, throughputs, +execution units, etc.). In this case, we use the AArch64+Neon architecture model and the Arm Cortex-A55 +microarchitecture model that come with SLOTHY. + +The calls to SLOTHY should be self-explanatory: + - `load_source_from_file` loads an assembly file to be optimized. + - `slothy.config` can be used to configure SLOTHY. For the documentation of the configuration options, see the comments in [config.py](../slothy/core/config.py).
+ - `optimize` performs the actual optimizations by calling the external constraint solver. + - `write_source_to_file` writes back the optimized assembly to a file. + +Setting `slothy.config.variable_size` results in the number of stalls being a parameter of the model that the constraint +solver is trying to minimize within a static 'stall budget'. By default, SLOTHY would start with a stall budget of 0 and +exponentially increase until a solution is found. To speed this process up, we set `stalls_first_attempt=32`, starting +the search with a sufficient stall budget of 32 cycles. + +The `variable_size` may not perform well for large examples. The default strategy (`variable_size=False`) is, hence, to +pass a fixed number of allowed stalls to the constraint solver and to have SLOTHY perform an 'external' binary search to +find the minimum number of stalls for which a solution exists. + +Even with this small Neon example, you can see that understanding the input code is much easier than the output code. In +fact, the input code can be further clarified through the use of macros and register aliases, leading to the following +'clean' version from +[examples/naive/aarch64/aarch64_simple0_macros.s](../examples/naive/aarch64/aarch64_simple0_macros.s) which makes it +apparent that our example is just a pair of NTT butterflies using Barrett multiplication. Note that the `.req` and +`.macro` directives used here are commonly supported [assembly +directives](https://www.sourceware.org/binutils/docs/as/ARM-Directives.html). 
+ +```nasm +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +start: + + ldr qtwiddle, [twiddle_ptr, #0] + + ldr qdata0, [data_ptr, #0*16] + ldr qdata1, [data_ptr, #1*16] + ldr qdata2, [data_ptr, #2*16] + ldr qdata3, [data_ptr, #3*16] + + butterfly data0, data1, tmp, twiddle, modulus + butterfly data2, data3, tmp, twiddle, modulus + + str qdata0, [data_ptr], #4*16 + str qdata1, [data_ptr, #-3*16] + str qdata2, [data_ptr, #-2*16] + str qdata3, [data_ptr, #-1*16] + +end: +``` + +SLOTHY will then internally expand all macros and the resulting DFG will be exactly the same as before. +To make this work, we have to slightly change the SLOTHY code: +```python +# example +slothy.load_source_from_file("../examples/naive/aarch64/aarch64_simple0_macros.s") +slothy.config.variable_size=True +slothy.config.constraints.stalls_first_attempt=32 + +slothy.optimize(start="start", end="end") +slothy.write_source_to_file("opt/aarch64_simple0_macros_a55.s") +``` + +The difference is that, we have to explicitly pass `start` and `end` labels to SLOTHY. +This is because SLOTHY does not understand the code before that and the parsing would fail if run on that part of the code. + +We have found it very useful to base assembly optimization on a 'clean' version as above and automate its optimization +using SLOTHY, which is the reason why we believe that SLOTHY can help with the development of auditable and maintainable +high-performance assembly. + + +## 4. 
Software Pipelining + +One of the most powerful features of SLOTHY is [software +pipelining](https://en.wikipedia.org/wiki/Software_pipelining). The core idea of software pipelining is that loop +scheduling can be improved by moving some instructions to earlier or later iterations of the loop, that is, by +interleaving loop iterations. Note that this does not mean that the loop has to be unrolled: By maintaining +the periodicity of the interleaved code, it is possible to keep it within a loop, thereby retaining code compactness. +Only the first and last iteration(s) may need to be treated separately; those are called the preamble and +postamble, respectively. + +Let's look at an example demonstrating how SLOTHY can perform software pipelining for you. +Consider the simple case of performing the code from the previous example within a loop with a fixed number of iterations (>=2). This is exactly what the +`aarch64_simple0_loop` example in SLOTHY does: +```nasm +... // .req and .macro as above + +count .req x2 +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] + +mov count, #16 +start: + + ldr qtwiddle, [twiddle_ptr, #0] + + ldr qdata0, [data_ptr, #0*16] + ldr qdata1, [data_ptr, #1*16] + ldr qdata2, [data_ptr, #2*16] + ldr qdata3, [data_ptr, #3*16] + + butterfly data0, data1, tmp, twiddle, modulus + butterfly data2, data3, tmp, twiddle, modulus + + str qdata0, [data_ptr], #4*16 + str qdata1, [data_ptr, #-3*16] + str qdata2, [data_ptr, #-2*16] + str qdata3, [data_ptr, #-1*16] + + subs count, count, #1 + cbnz count, start +``` + +Let's use SLOTHY to superoptimize this loop: +```python +slothy.load_source_from_file("examples/naive/aarch64/aarch64_simple0_loop.s") +slothy.config.variable_size=True +slothy.config.constraints.stalls_first_attempt=32 + +slothy.config.sw_pipelining.enabled = True +slothy.config.sw_pipelining.optimize_preamble = False +slothy.config.sw_pipelining.optimize_postamble = False +slothy.optimize_loop("start")
+slothy.write_source_to_file("opt/aarch64_simple0_loop_a55.s") +``` + +Software pipelining needs to be enabled by setting `slothy.config.sw_pipelining.enabled = True`. +We also need to specifically tell SLOTHY that we would like to optimize the loop starting at `start` -- SLOTHY will +automatically detect that the loop ends at `cbnz count, start`. Finally, `optimize_preamble = False` and +`optimize_postamble = False` prevent SLOTHY from optimizing the loop preamble and postamble (first/last iteration), which +it would by default -- you normally want this set, but we unset it here to simplify the output. This is what it will +look like: + +```nasm +// ... +count .req x2 +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 + ldr q3, [x0, #16] + sqrdmulh v7.8H, v3.8H, v0.H[1] + sub count, count, #1 +start: + mul v3.8H, v3.8H, v0.H[0] // ....*............. + // gap // .................. + ldr q19, [x0, #48] // ...*.............. + // gap // .................. + // gap // .................. + // gap // .................. + ldr q15, [x0, #0] // *................. + // gap // .................. + // gap // .................. + // gap // .................. + mls v3.8H, v7.8H, v1.H[0] // ......*........... + // gap // .................. + mul v13.8H, v19.8H, v0.H[0] // .........*........ + // gap // .................. + sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........*....... + // gap // .................. + ldr q7, [x0, #32] // ..*............... + // gap // .................. + // gap // .................. + // gap // .................. + sub v17.8H, v15.8H, v3.8H // .......*.......... + // gap // .................. + add v10.8H, v15.8H, v3.8H // ........*......... + // gap // .................. + mls v13.8H, v19.8H, v1.H[0] // ...........*...... + // gap // .................. + str q17, [x0, #16] // ...............*.. + // gap // .................. + ldr q3, [x0, #80] // .e................ + // gap // ..................
+ // gap // .................. + // gap // .................. + add v15.8H, v7.8H, v13.8H // .............*.... + // gap // .................. + str q10, [x0], #4*16 // ..............*... + // gap // .................. + sub v13.8H, v7.8H, v13.8H // ............*..... + // gap // .................. + str q15, [x0, #-32] // ................*. + // gap // .................. + sqrdmulh v7.8H, v3.8H, v0.H[1] // .....e............ + // gap // .................. + str q13, [x0, #-16] // .................* + // gap // .................. + + // original source code + // ldr q8, [x0, #0*16] // .......|.*............... + // ldr q9, [x0, #1*16] // e......|..........e...... + // ldr q10, [x0, #2*16] // .......|.....*........... + // ldr q11, [x0, #3*16] // .......|*................ + // mul v12.8h, v9.8h, v0.h[0] // .......*................. + // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e.|...............e. + // mls v12.8h, v9.8h, v1.h[0] // .......|..*.............. + // sub v9.8h, v8.8h, v12.8h // .......|......*.......... + // add v8.8h, v8.8h, v12.8h // .......|.......*......... + // mul v12.8h, v11.8h, v0.h[0] // .......|...*............. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // .......|....*............ + // mls v12.8h, v11.8h, v1.h[0] // .......|........*........ + // sub v11.8h, v10.8h, v12.8h // ...*...|.............*... + // add v10.8h, v10.8h, v12.8h // .*.....|...........*..... + // str q8, [x0], #4*16 // ..*....|............*.... + // str q9, [x0, #-3*16] // .......|.........*....... + // str q10, [x0, #-2*16] // ....*..|..............*.. 
+ // str q11, [x0, #-1*16] // ......*|................* + + sub count, count, #1 + cbnz count, start + mul v3.8H, v3.8H, v0.H[0] + ldr q19, [x0, #48] + ldr q15, [x0, #0] + mls v3.8H, v7.8H, v1.H[0] + mul v13.8H, v19.8H, v0.H[0] + sqrdmulh v19.8H, v19.8H, v0.H[1] + ldr q7, [x0, #32] + sub v17.8H, v15.8H, v3.8H + add v10.8H, v15.8H, v3.8H + mls v13.8H, v19.8H, v1.H[0] + str q17, [x0, #16] + add v15.8H, v7.8H, v13.8H + str q10, [x0], #4*16 + sub v13.8H, v7.8H, v13.8H + str q15, [x0, #-32] + str q13, [x0, #-16] +``` + +Let's start by looking at the optimized loop body going from `start:` to `cbnz count, start`: +We see that the loop now has 4 blocks of 3 `gap`s meaning that SLOTHY predicts 4 stalls of 1 cycle each. This compares +to 7 stalls in the version without software pipelining. We see that 2 instructions -- a load and the `sqrdmulh` consuming its result -- are marked as early instructions +(annotated `e`), meaning they have been moved into the previous iteration. Intuitively, this makes sense: We know +statically what data we need to load for the next iteration, and loads have a fairly long latency, so we can improve +performance by issuing loads early. For the code to still be correct, SLOTHY decreases the number of iterations by one +(`sub count, count, #1`), adds the missing early-instructions for the first iteration before the loop, and finally adds +the non-early instructions of the last iteration after the loop. + +Another experimental feature that can be witnessed in this example is _address offset fixup_. The `ldr` that +was moved into the previous iteration has been reordered with the `str _, [x0], #64`, which modifies the address +register. SLOTHY is aware of this and has adjusted the immediate offset in the `ldr` accordingly (from `#16` to `#80`). Without this, software +pipelining would not be possible here. Address offset fixup is an important yet somewhat subtle feature, and mistakes +in its handling are currently not caught by SLOTHY's selfcheck.
Going into the details of why that is would go too far for +this tutorial, but it is one of the reasons why the selfcheck does, as it stands, not replace a formal verification. + +## 5. Checking the quality of SLOTHY optimizations + +You may ask how we know that SLOTHY has actually done something useful here. Sure enough, the interleaving in the above +example looks somewhat sensible, and SLOTHY's model predicts only a few full-cycle stalls. However, at this point we don't +have any indicator of the impact of SLOTHY's optimizations on real hardware. + +Indeed, developing accurate microarchitectural models for SLOTHY is a time-consuming and iterative process: +It usually takes a while until you have refined things to the point where SLOTHY's prediction closely relates to +performance on real hardware. The most common issues requiring refinement are: +1) There is a mistake in the microarchitectural model mismatching what is written in the Software Optimization Guide (SWOG); +2) Some aspect of the microarchitecture (e.g., certain forwarding paths or other latency exceptions) is not documented in the SWOG. + +We briefly discuss two ways that we found useful to evaluate the quality of SLOTHY's optimizations and drive the +refinement of microarchitectural models. + +First, one useful tool for approximate but independent (of SLOTHY) performance evaluation is LLVM's [Machine Code +Analyzer](https://llvm.org/docs/CommandGuide/llvm-mca.html). If you have `llvm-mca` available in your PATH (you may have +to compile LLVM >= 18 yourself), you can make use of it in SLOTHY by setting the `with_llvm_mca` flag.
+Let's look at the last example and enable LLVM MCA: + +```python +slothy.load_source_from_file("../examples/naive/aarch64/aarch64_simple0_loop.s") +slothy.config.variable_size=True +slothy.config.constraints.stalls_first_attempt=32 + +slothy.config.sw_pipelining.enabled = True +slothy.config.sw_pipelining.optimize_preamble = False +slothy.config.sw_pipelining.optimize_postamble = False +slothy.config.with_llvm_mca = True +slothy.optimize_loop("start") +slothy.write_source_to_file("./aarch64_simple0_loop_mca_a55.s") +``` + +This will call LLVM MCA on both the original code and the optimized code and append the LLVM MCA statistics as a comment to the output. +Somewhere in the code you will see: +```nasm +// LLVM MCA STATISTICS (ORIGINAL) BEGIN +// +// Iterations: 100 +// Instructions: 2000 +// Total Cycles: 3102 +// Total uOps: 2100 +// +// Dispatch Width: 2 +// uOps Per Cycle: 0.68 +// IPC: 0.64 +// Block RThroughput: 10.5 + +... + +// LLVM MCA STATISTICS (OPTIMIZED) BEGIN +// +// Iterations: 100 +// Instructions: 2000 +// Total Cycles: 2102 +// Total uOps: 2100 +// +// Dispatch Width: 2 +// uOps Per Cycle: 1.00 +// IPC: 0.95 +// Block RThroughput: 10.5 +``` + +This suggests that our optimizations were actually useful: With respect to LLVM-MCA's scheduling model of Cortex-A55, the + cycle count per iteration was reduced from 31 cycles to 21 cycles. + +But LLVM MCA gives you more: It outputs a timeline view showing how each instruction travels through the pipeline: +```nasm +// Timeline view (ORIGINAL): +// 0123456789 0123456789 0123456789 0123456789 01234 +// Index 0123456789 0123456789 0123456789 0123456789 0123456789 +// +// [0,0] DeeE . . . . . . . . . . . . . . . . . . . ldr q0, [x1] +// [0,1] .DeeE. . . . . . . . . . . . . . . . . . . ldr q1, [x2] +// [0,2] . DeeE . . . . . . . . . . . . . . . . . . ldr q8, [x0] +// [0,3] . DeeE . . . . . . . . . . . . . . . . . . ldr q9, [x0, #16] +// [0,4] . DeeE . . . . . . . . . . . . . . . . . . 
ldr q10, [x0, #32] +// [0,5] . DeeE . . . . . . . . . . . . . . . . . . ldr q11, [x0, #48] +// [0,6] . .DeeeE . . . . . . . . . . . . . . . . . mul v24.8h, v9.8h, v0.h[0] +// [0,7] . . DeeeE . . . . . . . . . . . . . . . . . sqrdmulh v9.8h, v9.8h, v0.h[1] +// [0,8] . . .DeeeE . . . . . . . . . . . . . . . . mls v24.8h, v9.8h, v1.h[0] +// [0,9] . . . DeE . . . . . . . . . . . . . . . . sub v9.8h, v8.8h, v24.8h +// [0,10] . . . .DeE . . . . . . . . . . . . . . . . add v8.8h, v8.8h, v24.8h +// [0,11] . . . . DeeeE . . . . . . . . . . . . . . . mul v24.8h, v11.8h, v0.h[0] +// [0,12] . . . . DeeeE . . . . . . . . . . . . . . . sqrdmulh v11.8h, v11.8h, v0.h[1] +// [0,13] . . . . . DeeeE . . . . . . . . . . . . . . mls v24.8h, v11.8h, v1.h[0] +// [0,14] . . . . . .DeE . . . . . . . . . . . . . . sub v11.8h, v10.8h, v24.8h +// [0,15] . . . . . . DeE. . . . . . . . . . . . . . add v10.8h, v10.8h, v24.8h +// [0,16] . . . . . . DE. . . . . . . . . . . . . . str q8, [x0], #64 +// [0,17] . . . . . . DE . . . . . . . . . . . . . stur q9, [x0, #-48] +// [0,18] . . . . . . DE . . . . . . . . . . . . . stur q10, [x0, #-32] +// [0,19] . . . . . . .DE . . . . . . . . . . . . . stur q11, [x0, #-16] +// [1,0] . . . . . . .DeeE. . . . . . . . . . . . . ldr q0, [x1] +// [1,1] . . . . . . . DeeE . . . . . . . . . . . . ldr q1, [x2] +// [1,2] . . . . . . . DeeE . . . . . . . . . . . . ldr q8, [x0] +// [1,3] . . . . . . . DeeE . . . . . . . . . . . . ldr q9, [x0, #16] +// [1,4] . . . . . . . DeeE . . . . . . . . . . . . ldr q10, [x0, #32] +// [1,5] . . . . . . . .DeeE. . . . . . . . . . . . ldr q11, [x0, #48] +// [1,6] . . . . . . . . DeeeE . . . . . . . . . . . mul v24.8h, v9.8h, v0.h[0] +// [1,7] . . . . . . . . DeeeE . . . . . . . . . . . sqrdmulh v9.8h, v9.8h, v0.h[1] +// [1,8] . . . . . . . . . DeeeE . . . . . . . . . . mls v24.8h, v9.8h, v1.h[0] +// [1,9] . . . . . . . . . .DeE . . . . . . . . . . sub v9.8h, v8.8h, v24.8h +// [1,10] . . . . . . . . . . DeE. . . . . . . . . . 
add v8.8h, v8.8h, v24.8h +// [1,11] . . . . . . . . . . DeeeE . . . . . . . . . mul v24.8h, v11.8h, v0.h[0] +// [1,12] . . . . . . . . . . DeeeE . . . . . . . . . sqrdmulh v11.8h, v11.8h, v0.h[1] +// [1,13] . . . . . . . . . . . DeeeE . . . . . . . . mls v24.8h, v11.8h, v1.h[0] +// [1,14] . . . . . . . . . . . . DeE. . . . . . . . sub v11.8h, v10.8h, v24.8h +// [1,15] . . . . . . . . . . . . DeE . . . . . . . add v10.8h, v10.8h, v24.8h +// [1,16] . . . . . . . . . . . . DE . . . . . . . str q8, [x0], #64 +// [1,17] . . . . . . . . . . . . DE . . . . . . . stur q9, [x0, #-48] +// [1,18] . . . . . . . . . . . . .DE . . . . . . . stur q10, [x0, #-32] +// [1,19] . . . . . . . . . . . . . DE . . . . . . . stur q11, [x0, #-16] +// [2,0] . . . . . . . . . . . . . DeeE . . . . . . ldr q0, [x1] +// [2,1] . . . . . . . . . . . . . DeeE . . . . . . ldr q1, [x2] +// [2,2] . . . . . . . . . . . . . DeeE . . . . . . ldr q8, [x0] +// [2,3] . . . . . . . . . . . . . DeeE . . . . . . ldr q9, [x0, #16] +// [2,4] . . . . . . . . . . . . . .DeeE. . . . . . ldr q10, [x0, #32] +// [2,5] . . . . . . . . . . . . . . DeeE . . . . . ldr q11, [x0, #48] +// [2,6] . . . . . . . . . . . . . . DeeeE . . . . . mul v24.8h, v9.8h, v0.h[0] +// [2,7] . . . . . . . . . . . . . . DeeeE . . . . . sqrdmulh v9.8h, v9.8h, v0.h[1] +// [2,8] . . . . . . . . . . . . . . . DeeeE . . . . mls v24.8h, v9.8h, v1.h[0] +// [2,9] . . . . . . . . . . . . . . . . DeE. . . . sub v9.8h, v8.8h, v24.8h +// [2,10] . . . . . . . . . . . . . . . . DeE . . . add v8.8h, v8.8h, v24.8h +// [2,11] . . . . . . . . . . . . . . . . DeeeE . . . mul v24.8h, v11.8h, v0.h[0] +// [2,12] . . . . . . . . . . . . . . . . DeeeE. . . sqrdmulh v11.8h, v11.8h, v0.h[1] +// [2,13] . . . . . . . . . . . . . . . . . DeeeE . . mls v24.8h, v11.8h, v1.h[0] +// [2,14] . . . . . . . . . . . . . . . . . . DeE . sub v11.8h, v10.8h, v24.8h +// [2,15] . . . . . . . . . . . . . . . . . . DeE . add v10.8h, v10.8h, v24.8h +// [2,16] . . . . . . . . . . . . . . 
. . . . DE . str q8, [x0], #64 +// [2,17] . . . . . . . . . . . . . . . . . . .DE . stur q9, [x0, #-48] +// [2,18] . . . . . . . . . . . . . . . . . . . DE. stur q10, [x0, #-32] +// [2,19] . . . . . . . . . . . . . . . . . . . DE stur q11, [x0, #-16] +... + +// Timeline view (OPTIMIZED): +// 0123456789 0123456789 0123456789 +// Index 0123456789 0123456789 0123456789 01234 +// +// [0,0] DeeE . . . . . . . . . . . . . ldr q7, [x1] +// [0,1] .DeeE. . . . . . . . . . . . . ldr q31, [x0, #16] +// [0,2] . DeeE . . . . . . . . . . . . ldr q11, [x0, #48] +// [0,3] . DeeeE . . . . . . . . . . . . mul v20.8h, v31.8h, v7.h[0] +// [0,4] . DeeeE. . . . . . . . . . . . sqrdmulh v31.8h, v31.8h, v7.h[1] +// [0,5] . .DeeeE . . . . . . . . . . . mul v18.8h, v11.8h, v7.h[0] +// [0,6] . . DeeeE . . . . . . . . . . . sqrdmulh v7.8h, v11.8h, v7.h[1] +// [0,7] . . DeeE . . . . . . . . . . . ldr q11, [x2] +// [0,8] . . DeeE . . . . . . . . . . . ldr q8, [x0] +// [0,9] . . .DeeeE . . . . . . . . . . mls v20.8h, v31.8h, v11.h[0] +// [0,10] . . . DeeeE . . . . . . . . . . mls v18.8h, v7.8h, v11.h[0] +// [0,11] . . . DeeE . . . . . . . . . . ldr q7, [x0, #32] +// [0,12] . . . DeE . . . . . . . . . . sub v31.8h, v8.8h, v20.8h +// [0,13] . . . .DeE . . . . . . . . . . add v11.8h, v8.8h, v20.8h +// [0,14] . . . . DeE. . . . . . . . . . sub v20.8h, v7.8h, v18.8h +// [0,15] . . . . DE . . . . . . . . . . str q31, [x0, #16] +// [0,16] . . . . DeE . . . . . . . . . add v7.8h, v7.8h, v18.8h +// [0,17] . . . . DE . . . . . . . . . str q11, [x0], #64 +// [0,18] . . . . DE . . . . . . . . . stur q7, [x0, #-32] +// [0,19] . . . . .DE . . . . . . . . . stur q20, [x0, #-16] +// [1,0] . . . . .DeeE. . . . . . . . . ldr q7, [x1] +// [1,1] . . . . . DeeE . . . . . . . . ldr q31, [x0, #16] +// [1,2] . . . . . DeeE . . . . . . . . ldr q11, [x0, #48] +// [1,3] . . . . . DeeeE. . . . . . . . mul v20.8h, v31.8h, v7.h[0] +// [1,4] . . . . . .DeeeE . . . . . . . sqrdmulh v31.8h, v31.8h, v7.h[1] +// [1,5] . . . . . . 
DeeeE . . . . . . . mul v18.8h, v11.8h, v7.h[0] +// [1,6] . . . . . . DeeeE . . . . . . . sqrdmulh v7.8h, v11.8h, v7.h[1] +// [1,7] . . . . . . DeeE . . . . . . . ldr q11, [x2] +// [1,8] . . . . . . DeeE . . . . . . . ldr q8, [x0] +// [1,9] . . . . . . . DeeeE . . . . . . mls v20.8h, v31.8h, v11.h[0] +// [1,10] . . . . . . . DeeeE . . . . . . mls v18.8h, v7.8h, v11.h[0] +// [1,11] . . . . . . . DeeE . . . . . . ldr q7, [x0, #32] +// [1,12] . . . . . . . .DeE . . . . . . sub v31.8h, v8.8h, v20.8h +// [1,13] . . . . . . . . DeE. . . . . . add v11.8h, v8.8h, v20.8h +// [1,14] . . . . . . . . DeE . . . . . sub v20.8h, v7.8h, v18.8h +// [1,15] . . . . . . . . DE. . . . . . str q31, [x0, #16] +// [1,16] . . . . . . . . DeE . . . . . add v7.8h, v7.8h, v18.8h +// [1,17] . . . . . . . . DE . . . . . str q11, [x0], #64 +// [1,18] . . . . . . . . .DE . . . . . stur q7, [x0, #-32] +// [1,19] . . . . . . . . . DE . . . . . stur q20, [x0, #-16] +// [2,0] . . . . . . . . . DeeE . . . . ldr q7, [x1] +// [2,1] . . . . . . . . . DeeE . . . . ldr q31, [x0, #16] +// [2,2] . . . . . . . . . DeeE . . . . ldr q11, [x0, #48] +// [2,3] . . . . . . . . . .DeeeE . . . mul v20.8h, v31.8h, v7.h[0] +// [2,4] . . . . . . . . . . DeeeE . . . sqrdmulh v31.8h, v31.8h, v7.h[1] +// [2,5] . . . . . . . . . . DeeeE . . . mul v18.8h, v11.8h, v7.h[0] +// [2,6] . . . . . . . . . . DeeeE . . . sqrdmulh v7.8h, v11.8h, v7.h[1] +// [2,7] . . . . . . . . . . DeeE . . . ldr q11, [x2] +// [2,8] . . . . . . . . . . .DeeE. . . ldr q8, [x0] +// [2,9] . . . . . . . . . . . DeeeE . . mls v20.8h, v31.8h, v11.h[0] +// [2,10] . . . . . . . . . . . DeeeE . . mls v18.8h, v7.8h, v11.h[0] +// [2,11] . . . . . . . . . . . DeeE . . ldr q7, [x0, #32] +// [2,12] . . . . . . . . . . . . DeE. . sub v31.8h, v8.8h, v20.8h +// [2,13] . . . . . . . . . . . . DeE . add v11.8h, v8.8h, v20.8h +// [2,14] . . . . . . . . . . . . DeE . sub v20.8h, v7.8h, v18.8h +// [2,15] . . . . . . . . . . . . DE . str q31, [x0, #16] +// [2,16] . . . . . 
. . . . . . . DeE . add v7.8h, v7.8h, v18.8h +// [2,17] . . . . . . . . . . . . .DE . str q11, [x0], #64 +// [2,18] . . . . . . . . . . . . . DE. stur q7, [x0, #-32] +// [2,19] . . . . . . . . . . . . . DE stur q20, [x0, #-16] +``` + +However, LLVM MCA's model might not be accurate either and cannot replace measurements on real hardware -- so let's +do that. Here, we use a [profiling tool](https://github.com/slothy-optimizer/pqax/tree/main/tests/profiling) we wrote +as part of the [pqax](https://github.com/slothy-optimizer/pqax) benchmarking framework. It takes an assembly snippet as +input and automatically generates a program running and benchmarking prefixes of the input, and combining them into a +performance diagram similar to the one generated by LLVM-MCA. Here's the output in our case: + +```nasm +===== Stepwise profiling ======= +[ 0]: ldr q0, [x1, #0] ......*..................................... +[ 1]: ldr q8, [x0, #0*16] .......*.................................... +[ 2]: ldr q9, [x0, #1*16] .........*.................................. +[ 3]: ldr q10, [x0, #2*16] ...........*................................ +[ 4]: ldr q11, [x0, #3*16] .............*.............................. +[ 5]: mul v12.8h, v9.8h, v0.h[0] ...............*............................ +[ 6]: sqrdmulh v9.8h, v9.8h, v0.h[1] ................*........................... +[ 7]: mls v12.8h, v9.8h, v1.h[0] .................*.......................... +[ 8]: sub v9.8h, v8.8h, v12.8h .....................*...................... +[ 9]: add v8.8h, v8.8h, v12.8h ........................*................... +[ 10]: mul v12.8h, v11.8h, v0.h[0] .........................*.................. +[ 11]: sqrdmulh v11.8h, v11.8h, v0.h[1] ..........................*................. +[ 12]: mls v12.8h, v11.8h, v1.h[0] ...........................*................ +[ 13]: sub v11.8h, v10.8h, v12.8h ...............................*............
+[ 14]: add v10.8h, v10.8h, v12.8h ..................................*......... +[ 15]: str q8, [x0], #4*16 ...................................*........ +[ 16]: str q9, [x0, #-3*16] .....................................*...... +[ 17]: str q10, [x0, #-2*16] .....................................*...... +[ 18]: str q11, [x0, #-1*16] ........................................*... + +===== Stepwise profiling (OPTIMIZED) ======= +[ 0]: ldr q18, [x0, #16] // .*........................ +[ 1]: sqrdmulh v8.8H, v6.8H, v2.H[1] // ..*....................... +[ 2]: mul v23.8H, v6.8H, v2.H[0] // ...*...................... +[ 3]: ldr q31, [x0, #32] // ....*..................... +[ 4]: mul v3.8H, v18.8H, v2.H[0] // ......*................... +[ 5]: mls v23.8H, v8.8H, v1.H[0] // .......*.................. +[ 6]: sqrdmulh v9.8H, v18.8H, v2.H[1] // ........*................. +[ 7]: ldr q15, [x0, #0] // .........*................ +[ 8]: sub v11.8H, v31.8H, v23.8H // ...........*.............. +[ 9]: mls v3.8H, v9.8H, v1.H[0] // ............*............. +[ 10]: add v16.8H, v31.8H, v23.8H // .............*............ +[ 11]: str q11, [x0, #48] // ..............*........... +[ 12]: ldr q2, [x1, #0] // ...............*.......... +[ 13]: add v13.8H, v15.8H, v3.8H // .................*........ +[ 14]: str q16, [x0, #32] // ..................*....... +[ 15]: sub v7.8H, v15.8H, v3.8H // ...................*...... +[ 16]: str q13, [x0], #4*16 // ....................*..... +[ 17]: ldr q6, [x0, #48] // .....................*.... +[ 18]: str q7, [x0, #-48] // .......................*.. +``` + +We can see that SLOTHY's predictions were exactly right, and that LLVM-MCA's model is off in a few places. +So, in a nutshell, we'd say that LLVM-MCA is great for quick evaluation of performance, but when you get down +to the last cycle and fine-tuning your model, there is no way around measurements on real hardware. + +## 6. 
Optimizing a full Neon NTT + +The examples previously considered were all toy examples, so you may wonder how to apply SLOTHY to actual cryptographic code. +Let's look at a real-world example: The Kyber number-theoretic transform -- a core arithmetic function of the Kyber key-encapsulation mechanism making up a large chunk of the total run-time. +The target platform is again the Arm Cortex-A55 and the code primarily consists of +Neon vector instructions. +We'll consider a straightforward implementation available here: [ntt_kyber_123_4567.s](../examples/naive/aarch64/ntt_kyber_123_4567.s). +If you have ever written an NTT, it should be fairly easy to understand what the code is doing. +The code consists of 2 main loops implementing layers 1+2+3 and 4+5+6+7 of the NTT. +The actual operations are wrapped in macros implementing butterflies on single vector registers. +Note that this code performs very poorly: No consideration was given to the intricacies of the microarchitecture. + +Let's run SLOTHY on this code: +```python +slothy.load_source_from_file("examples/naive/aarch64/ntt_kyber_123_4567.s") +slothy.config.sw_pipelining.enabled = True +slothy.config.inputs_are_outputs = True +slothy.config.sw_pipelining.minimize_overlapping = False +slothy.config.variable_size = True +slothy.config.reserved_regs = [f"x{i}" for i in range(0, 7)] + ["x30", "sp"] +slothy.config.constraints.stalls_first_attempt = 64 +slothy.optimize_loop("layer123_start") +slothy.optimize_loop("layer4567_start") +slothy.write_source_to_file("opt/ntt_kyber_123_4567_opt_a55.s") +``` + +We simply optimize both loops separately. +You will notice some additional flags we have set. To read the documentation of those, please have a look at [config.py](../slothy/core/config.py). +We have set an additional flag: `inputs_are_outputs = True`. 
This tells SLOTHY that the registers that are used as +inputs to the loop (e.g., the pointer to the polynomial input) are also outputs of the entire loop; otherwise, SLOTHY +could overwrite them in the postamble once they are no longer needed. You most likely want `inputs_are_outputs=True` +whenever you are optimizing a loop. We also use the `reserved_regs` option to tell SLOTHY that registers `x0, ..., x6, +x30, sp` are used for other purposes and should not be used by SLOTHY. When optimizing only parts of a function, it is +essential to tell SLOTHY which registers should not be used: By default SLOTHY will use any of the architectural +registers. If you are familiar with inline assembly, SLOTHY's `reserved_regs` are essentially the complement of the +'clobber list'. + +When running this example, you will notice that it has a significantly longer runtime. +On my Intel i7-1360P it takes approximately 15 minutes to optimize both loops. +You may instead look at an optimized version of the same code [examples/opt/aarch64/ntt_kyber_123_4567_opt_a55.s](../examples/opt/aarch64/ntt_kyber_123_4567_opt_a55.s). +You will notice that both loops have many early instructions, and coming up with this code by hand would be tedious, time-consuming and error-prone. + + +## 7. Optimizing larger pieces of code + +We've seen that the code above can be optimized relatively fast (within seconds to minutes on a laptop). +When using a more powerful machine and allowing optimization times of hours, one can scale this up to larger examples. +We've successfully used (vanilla) SLOTHY to optimize code snippets of up to 180 instructions. +However, for larger code at a certain point the constraint solving becomes prohibitively expensive and we need to use a different strategy.
+ +One such example is the X25519 implementation we looked at in the [SLOTHY paper](https://eprint.iacr.org/2022/1303) available in [X25519-AArch64-simple.s](../examples/naive/aarch64/X25519-AArch64-simple.s). +It is a hybrid vector-scalar implementation based on an [implementation](https://github.com/Emill/X25519-AArch64) by Lenngren. +Its core loop consists of 958 instructions which well exceeds what SLOTHY can currently optimize in a single pass. + +However, we can still make use of SLOTHY to optimize this code by employing heuristics. +One particularly useful heuristic supported by SLOTHY is the `splitting` heuristic. +When a piece of code is too large to be optimized at once, it splits it into multiple overlapping pieces that are optimized separately. +With this approach one loses the optimality guarantees as it may be that there is a solution that SLOTHY cannot find due to the splitting. +However, by repeatedly running SLOTHY using the `splitting` heuristic, we managed to outperform the state-of-the-art and get very close to optimal results (in terms of IPC).
+ +To demonstrate the splitting heuristic we can use the following SLOTHY call: +```python +# example +slothy.load_source_from_file("../examples/naive/aarch64/X25519-AArch64-simple.s") + +# first pass: replace symbolic register names by architectural registers +slothy.config.inputs_are_outputs=True +slothy.config.outputs=["x0"] +slothy.config.constraints.functional_only = True +slothy.config.constraints.allow_reordering = False +slothy.optimize(start="mainloop", end="end_label") +slothy.config.constraints.functional_only = False +slothy.config.constraints.allow_reordering = True + +# second pass: splitting heuristic +slothy.config.variable_size=True +slothy.config.constraints.stalls_first_attempt=32 +slothy.config.split_heuristic = True +slothy.config.split_heuristic_stepsize = 0.05 +slothy.config.split_heuristic_factor = 10 +slothy.config.split_heuristic_repeat = 2 +slothy.optimize(start="mainloop", end="end_label") +slothy.write_source_to_file("opt/X25519-AArch64-simple_opt.s") +``` + + +The `splitting` heuristic can be turned on by setting `slothy.config.split_heuristic = True`. +It has three main parameters: +- `split_heuristic_factor` : Determines the size of each split. In this case, 10 means that we will be optimizing 10% of the original code at a time. +- `split_heuristic_stepsize` : Controls the degree of overlapping of the sliding window. Setting it to 0.05 means the sliding window moves by 5% every time. We will start with optimizing the first 10% ([0,0.1]) of the code, then [0.05,0.15], [0.1,0.20], ... +- `split_heuristic_repeat`: The number of times the optimization should be repeated. + +You will notice in the example above, that there is another call to `slothy.optimize()` prior to that. +This is needed as the input implementation is using symbolic register names which is a feature unrelated to the splitting heuristic that we want to demonstrate here. +It allows a developer of the code to leave the register allocation up to SLOTHY. 
+Unfortunately, it is not compatible with the splitting heuristic (as register allocation can't be performed locally), and hence we first need to do the register allocation on the full code before we continue. +We can configure SLOTHY to only consider register allocation by setting `allow_reordering=False` (disabling the ordering constraints) and `functional_only=True` (disabling the microarchitectural constraints). +In this way, the constraints remain manageable, and SLOTHY finds a register allocation within a few minutes. + +Running this example takes around 15 minutes. +You can instead look at the output available in [opt/X25519-AArch64-simple_opt.s](opt/X25519-AArch64-simple_opt.s). +The output will look similar to the previous examples and contains significantly fewer pipeline stalls than the input. +For achieving the best performance, we require a few more calls to SLOTHY. You can find the script we used [here](../paper/scripts/slothy_x25519.sh) - it runs for around 1.5 hours. + +## 8. Adding a new microarchitecture + +You may wonder how to extend SLOTHY to include a new microarchitecture. +For example, you may want to optimize code for a newer iteration of the Arm Cortex-A55, e.g., the Arm Cortex-A510. +To understand what is needed for that, let's look at the microarchitectural model for the Cortex-A55 available in [slothy/targets/aarch64/cortex_a55.py](../slothy/targets/aarch64/cortex_a55.py). + +Skipping some boilerplate code, you will see the following structure: +```python +from slothy.targets.aarch64.aarch64_neon import * + +issue_rate = 2 +class ExecutionUnit(Enum): + """Enumeration of execution units in Cortex-A55 model""" + SCALAR_ALU0=1 + SCALAR_ALU1=2 + SCALAR_MAC=3 + SCALAR_LOAD=4 + SCALAR_STORE=5 + VEC0=6 + VEC1=7 + # ... + +execution_units = { + // ... +} + +inverse_throughput = { + // ... +} + +default_latencies = { + // ... +} + + +def get_latency(src, out_idx, dst): + // ... + latency = lookup_multidict( + default_latencies, src) + // ...
+ return latency + +def get_units(src): + units = lookup_multidict(execution_units, src) + if isinstance(units,list): + return units + return [units] + +def get_inverse_throughput(src): + return lookup_multidict( + inverse_throughput, src) +``` + +Going through the snippet, we can see the core components: + - Definition of the `issue_rate` corresponding to the number of issue slots available per cycle. Since the Cortex-A55 is a dual-issue CPU, this is two. + - Definition of an `Enum` modelling the different execution units available. In this case, we model 2 scalar units, one + MAC unit, 2 64-bit vector units, one load unit, and one store unit. + - Finally, we need to implement the functions `get_latency`, `get_units`, `get_inverse_throughput` returning the + latency, occupied execution units, and throughputs. The input to these functions is a class from the architectural + model representing the instruction in question. For example, the class `vmull` in + [aarch64_neon.py](../slothy/targets/aarch64/aarch64_neon.py) corresponds to the `umull` instruction. We commonly + implement this using dictionaries above. + +For example, for the (128-bit/qform) `vmull` instruction, we can find in the [Arm Cortex-A55 Software Optimization +Guide](https://developer.arm.com/documentation/EPM128372/latest/) that it occupies both vector execution units, has an +inverse throughput of 1, and a latency of 4 cycles. We can model this in the following way: + +```python +execution_units = { + ( vmull ): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], +} + +inverse_throughput = { + ( vmull ) : 1, +} + +default_latencies = { + ( vmull ) : 4, +} +``` + +We mostly use the tuple-syntax, so we can group together instructions that belong together. +For example, later we may want to add the Neon `add`. From the SWOG we can see that (128-bit/qform) `add` occupies both +64-bit vector execution units, has a latency of 3 cycles, and throughput of 1 cycle. 
+We can extend the above model as follows: + +```python +execution_units = { + ( vmull, vadd ): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], +} + +inverse_throughput = { + ( vmull, vadd ) : 1, +} + +default_latencies = { + ( vmull ) : 4, + ( vadd ) : 3, +} +``` + + +(When looking at the actual model, you will notice that this is not quite how it is modelled. You will see that for some +instructions, we have to distinguish between the q-form (128-bit) and the d-form (64-bit) of the instruction. Q-form +instructions occupy both vector execution units, while most D-form instructions occupy only 1. Latencies also vary +depending on the actual form.) + +Note that both the architectural model and the micro-architectural model can be built lazily: As long as the +corresponding instructions do not appear in your input, you may leave out their description. +As soon as you hit an instruction that is not part of the architectural or micro-architectural model, you will see an +error. + +## Troubleshooting + +- ModuleNotFoundError: No module named 'ortools' + +This suggests that you have not installed the required dependencies needed by SLOTHY. +Either you need to follow the installation instructions, or if you have done that already, you likely forgot to enter the virtual environment you have installed them in using `source venv/bin/activate`. You will have to run this every time you open a new terminal. + +- The selfcheck passes but the code is functionally incorrect! + +The most common reason for this is a bad configuration: Check that all registers that must be kept for the sake of +the surrounding code are marked as `reserved_regs`. + +Another possibility, albeit hopefully rare by now, is a failure during address offset fixup: This feature is not yet +stable, and the selfcheck is currently blind to erroneous calculations here. If you are sure your configuration is correct, you might +want to check the adjusted address offsets manually. If you find a bug, let us know!
diff --git a/tutorial/opt/X25519-AArch64-simple_opt.s b/tutorial/opt/X25519-AArch64-simple_opt.s new file mode 100644 index 00000000..3bedd5ad --- /dev/null +++ b/tutorial/opt/X25519-AArch64-simple_opt.s @@ -0,0 +1,3652 @@ + /* X25519-AArch64 by Emil Lenngren (2018) + * + * To the extent possible under law, the person who associated CC0 with + * X25519-AArch64 has waived all copyright and related or neighboring rights + * to X25519-AArch64. + * + * You should have received a copy of the CC0 legalcode along with this + * work. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. + */ + +/* + * This is an AArch64 implementation of X25519. + * It follows the reference implementation where the representation of + * a field element [0..2^255-19) is represented by a 256-bit little endian integer, + * reduced modulo 2^256-38, and may possibly be in the range [2^256-38..2^256). + * The scalar is a 256-bit integer where certain bits are hardcoded per specification. + * + * The implementation runs in constant time (~145k cycles on Cortex-A53), + * and no conditional branches or memory access pattern depend on secret data. + */ + +/* + * Implementation manually de-interleaved and modularized for use with SLOTHY.
See + * + * Fast and Clean: Auditable High Performance Assembly via Constraint Solving + * (Abdulrahman, Becker, Kannwischer, Klein) + */ + +#include +#include "instruction_wrappers.i" + +.macro fcsel_dform out, in0, in1, cond // @slothy:no-unfold + fcsel dform_\out, dform_\in0, dform_\in1, \cond +.endm + +#define STACK_MASK1 0 +#define STACK_MASK2 8 +#define STACK_A_0 16 +#define STACK_A_8 (STACK_A_0+ 8) +#define STACK_A_16 (STACK_A_0+16) +#define STACK_A_24 (STACK_A_0+24) +#define STACK_A_32 (STACK_A_0+32) +#define STACK_B_0 64 +#define STACK_B_8 (STACK_B_0+ 8) +#define STACK_B_16 (STACK_B_0+16) +#define STACK_B_24 (STACK_B_0+24) +#define STACK_B_32 (STACK_B_0+32) +#define STACK_CTR 104 +#define STACK_LASTBIT 108 +#define STACK_SCALAR 112 +#define STACK_X_0 168 +#define STACK_X_8 (STACK_X_0+ 8) +#define STACK_X_16 (STACK_X_0+16) +#define STACK_X_24 (STACK_X_0+24) +#define STACK_X_32 (STACK_X_0+32) +#define STACK_OUT_PTR (STACK_X_0+48) + + .cpu generic+fp+simd + .text + .align 2 + + // in: x0: pointer to 8 bytes (read byte-wise, so any alignment works) + // out: x0: loaded 64-bit value, byte 0 least significant; clobbers x1-x8 + // .type load64unaligned, %function +load64unaligned: + ldrb w1, [x0] // w1..w8 = bytes 0..7 at x0 + ldrb w2, [x0, #1] + ldrb w3, [x0, #2] + ldrb w4, [x0, #3] + ldrb w5, [x0, #4] + ldrb w6, [x0, #5] + ldrb w7, [x0, #6] + ldrb w8, [x0, #7] + + orr w1, w1, w2, lsl #8 // merge byte pairs into 16-bit values + orr w3, w3, w4, lsl #8 + orr w5, w5, w6, lsl #8 + orr w7, w7, w8, lsl #8 + + orr w1, w1, w3, lsl #16 // merge 16-bit values into the low (w1) and high (w5) 32-bit words + orr w5, w5, w7, lsl #16 + + orr x0, x1, x5, lsl #32 // x0 = low word | high word << 32 + + ret + // .size load64unaligned, .-load64unaligned + + // in: x0: pointer to 32 bytes + // out: x0-x3: loaded value as four 64-bit words, bytes 0..7 in x0 up to bytes 24..31 in x3 + // .type load256unaligned, %function +load256unaligned: + stp x29, x30, [sp, #-64]! // open a 64-byte frame; x30 must be saved because of the bl calls below
+ mov x29, sp + stp x19, x20, [sp, #16] // x19-x22 are used below to keep values across the calls + stp x21, x22, [sp, #32] + + mov x19, x0 // keep the base pointer; x0 is overwritten by each call's result + bl load64unaligned + mov x20, x0 // bytes 0..7 + add x0, x19, #8 + bl load64unaligned + mov x21, x0 // bytes 8..15 + add x0, x19, #16 + bl load64unaligned + mov x22, x0 // bytes 16..23 + add x0, x19, #24 + bl load64unaligned + mov x3, x0 // bytes 24..31, already in its output register + + mov x0, x20 // move the first three words into the x0-x2 outputs + mov x1, x21 + mov x2, x22 + + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x29, x30, [sp], #64 // restore x29/x30 and release the frame + ret + // .size load256unaligned, .-load256unaligned + +vAB0 .req v0 +vAB1 .req v1 +vAB2 .req v2 +vAB3 .req v3 +vAB4 .req v4 +vAB5 .req v5 +vAB6 .req v6 +vAB7 .req v7 +vAB8 .req v8 +vAB9 .req v9 + +vT0 .req vAB0 +vT1 .req vAB1 +vT2 .req vAB2 +vT3 .req vAB3 +vT4 .req vAB4 +vT5 .req vAB5 +vT6 .req vAB6 +vT7 .req vAB7 +vT8 .req vAB8 +vT9 .req vAB9 + +vTA0 .req vAB0 +vTA1 .req vAB1 +vTA2 .req vAB2 +vTA3 .req vAB3 +vTA4 .req vAB4 +vTA5 .req vAB5 +vTA6 .req vAB6 +vTA7 .req vAB7 +vTA8 .req vAB8 +vTA9 .req vAB9 + +vBX0 .req v10 +vBX1 .req v11 +vBX2 .req v12 +vBX3 .req v13 +vBX4 .req v14 +vBX5 .req v15 +vBX6 .req v16 +vBX7 .req v17 +vBX8 .req v18 +vBX9 .req v19 + +vDC0 .req vBX0 +vDC1 .req vBX1 +vDC2 .req vBX2 +vDC3 .req vBX3 +vDC4 .req vBX4 +vDC5 .req vBX5 +vDC6 .req vBX6 +vDC7 .req vBX7 +vDC8 .req vBX8 +vDC9 .req vBX9 + +vADBC0 .req v20 +vADBC1 .req v21 +vADBC2 .req v22 +vADBC3 .req v23 +vADBC4 .req v24 +vADBC5 .req v25 +vADBC6 .req v26 +vADBC7 .req v27 +vADBC8 .req v28 +vADBC9 .req v29 + +vX4Z50 .req vADBC0 +vX4Z51 .req vADBC1 +vX4Z52 .req vADBC2 +vX4Z53 .req vADBC3 +vX4Z54 .req vADBC4 +vX4Z55 .req vADBC5 +vX4Z56 .req vADBC6 +vX4Z57 .req vADBC7 +vX4Z58 .req vADBC8 +vX4Z59 .req vADBC9 + +vMaskA .req v30 +vMaskB .req v15 + +vZ20 .req v1 +vZ22 .req v3 +vZ24 .req v5 +vZ26 .req v7 +vZ28 .req v9 + +vZ30 .req v11 +vZ32 .req v13 +vZ34 .req v15 +vZ36 .req v17 +vZ38 .req v19 + +vX20 .req v0 +vX22 .req v2 +vX24 .req v4 +vX26 .req v6 +vX28 .req v8 + +vX30 .req v10 +vX32 .req v12 +vX34 .req v14 +vX36 .req v16 +vX38 .req v18 + +vB0 .req v20 +vB2 .req v21 +vB4 .req v22 +vB6 .req v23 +vB8
.req v24 + +vA0 .req v0 +vA2 .req v2 +vA4 .req v4 +vA6 .req v6 +vA8 .req v8 + +vC0 .req v10 +vC2 .req v12 +vC4 .req v14 +vC6 .req v16 +vC8 .req v18 + +vD0 .req v25 +vD2 .req v26 +vD4 .req v27 +vD6 .req v28 +vD8 .req v29 + +vF0 .req v1 +vF2 .req v3 +vF4 .req v5 +vF6 .req v7 +vF8 .req v9 + +vG0 .req v20 +vG2 .req v21 +vG4 .req v22 +vG6 .req v23 +vG8 .req v24 + +// F (scalar limbs in x0-x9) +sF0 .req x0 +sF1 .req x1 +sF2 .req x2 +sF3 .req x3 +sF4 .req x4 +sF5 .req x5 +sF6 .req x6 +sF7 .req x7 +sF8 .req x8 +sF9 .req x9 + +sAA0 .req x20 +sAA1 .req x21 +sAA2 .req x22 +sAA3 .req x23 +sAA4 .req x24 +sAA5 .req x25 +sAA6 .req x26 +sAA7 .req x27 +sAA8 .req x28 +sAA9 .req x19 + +stmp .req x2 + +// G (same registers x0-x9 as F) +sG0 .req x0 +sG1 .req x1 +sG2 .req x2 +sG3 .req x3 +sG4 .req x4 +sG5 .req x5 +sG6 .req x6 +sG7 .req x7 +sG8 .req x8 +sG9 .req x9 + +sBB0 .req x0 +sBB1 .req x1 +sBB2 .req x2 +sBB3 .req x3 +sBB4 .req x4 +sBB5 .req x5 +sBB6 .req x6 +sBB7 .req x7 +sBB8 .req x8 +sBB9 .req x9 + +// E (x10-x17, then x19 and x20, which are also sAA9 and sAA0) +sE0 .req x10 +sE1 .req x11 +sE2 .req x12 +sE3 .req x13 +sE4 .req x14 +sE5 .req x15 +sE6 .req x16 +sE7 .req x17 +sE8 .req x19 +sE9 .req x20 + +sZ40 .req x23 +sZ41 .req x3 +sZ42 .req x21 +sZ44 .req x7 +sZ45 .req x6 +sZ46 .req x24 +sZ48 .req x22 + +START: + + +.macro scalar_stack_ldr sA, offset, name + ldr \sA\()0, [sp, #\offset\()_0] // @slothy:reads=[\name\()0] + ldr \sA\()2, [sp, #\offset\()_8] // @slothy:reads=[\name\()8] + ldr \sA\()4, [sp, #\offset\()_16] // @slothy:reads=[\name\()16] + ldr \sA\()6, [sp, #\offset\()_24] // @slothy:reads=[\name\()24] + ldr \sA\()8, [sp, #\offset\()_32] // @slothy:reads=[\name\()32] +.endm + +.macro scalar_stack_str offset, sA, name + stp \sA\()0, \sA\()2, [sp, #\offset\()_0] // @slothy:writes=[\name\()0,\name\()8] + stp \sA\()4, \sA\()6, [sp, #\offset\()_16] // @slothy:writes=[\name\()16,\name\()24] + str \sA\()8, [sp, #\offset\()_32] // @slothy:writes=[\name\()32] +.endm + +.macro vector_stack_str offset, vA, name + stp D<\vA\()0>, D<\vA\()2>, [sp, #\offset\()_0] //
@slothy:writes=[\name\()0,\name\()8] + stp D<\vA\()4>, D<\vA\()6>, [sp, #\offset\()_16] // @slothy:writes=[\name\()16,\name\()24] + str D<\vA\()8>, [sp, #\offset\()_32] // @slothy:writes=[\name\()32] +.endm + + // TODO: eliminate this explicit register assignment by converting stack_vld2_lane to AArch64Instruction + xvector_load_lane_tmp .req x26 + +.macro vector_load_lane vA, offset, lane, name + add xvector_load_lane_tmp, sp, #\offset\()_0 // tmp = address of the first stack slot; each ld2 below post-increments it by 8 + ld2 { \vA\()0.s, \vA\()1.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()0] + ld2 { \vA\()2.s, \vA\()3.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()8] + ld2 { \vA\()4.s, \vA\()5.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()16] + ld2 { \vA\()6.s, \vA\()7.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()24] + ld2 { \vA\()8.s, \vA\()9.s }[\lane\()], [xvector_load_lane_tmp], #8 // @slothy:reads=[\name\()32] +.endm + +.macro vector_sub_inner vC0, vC2, vC4, vC6, vC8, vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8 + // (2^255-19)*4 - vB + sub \vC0\().2s, v28.2s, \vB0\().2s // limb 0 is subtracted from v28, limbs 2-8 from v29; both are only read in this macro + sub \vC2\().2s, v29.2s, \vB2\().2s + sub \vC4\().2s, v29.2s, \vB4\().2s + sub \vC6\().2s, v29.2s, \vB6\().2s + sub \vC8\().2s, v29.2s, \vB8\().2s + + // ...
+ vA + add \vC0\().2s, \vA0\().2s, \vC0\().2s + add \vC2\().2s, \vA2\().2s, \vC2\().2s + add \vC4\().2s, \vA4\().2s, \vC4\().2s + add \vC6\().2s, \vA6\().2s, \vC6\().2s + add \vC8\().2s, \vA8\().2s, \vC8\().2s +.endm + +.macro vector_sub vC, vA, vB + vector_sub_inner \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8, \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8 +.endm + + +.macro vector_add_inner vC0, vC2, vC4, vC6, vC8, vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8 + add \vC0\().2s, \vA0\().2s, \vB0\().2s // lane-wise 32-bit add per register; nothing here carries between registers + add \vC2\().2s, \vA2\().2s, \vB2\().2s + add \vC4\().2s, \vA4\().2s, \vB4\().2s + add \vC6\().2s, \vA6\().2s, \vB6\().2s + add \vC8\().2s, \vA8\().2s, \vB8\().2s +.endm + +.macro vector_add vC, vA, vB + vector_add_inner \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8, \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8 +.endm + +.macro vector_cmov_inner vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8, vC0, vC2, vC4, vC6, vC8 + fcsel_dform \vA0, \vB0, \vC0, eq // vA = vB if the eq condition holds, else vC; the flags are not set inside this macro + fcsel_dform \vA2, \vB2, \vC2, eq + fcsel_dform \vA4, \vB4, \vC4, eq + fcsel_dform \vA6, \vB6, \vC6, eq + fcsel_dform \vA8, \vB8, \vC8, eq +.endm + +.macro vector_cmov vA, vB, vC + vector_cmov_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8, \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8, // NOTE(review): trailing comma after the last argument - confirm this is intended +.endm + +.macro vector_transpose_inner vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9, vB0, vB2, vB4, vB6, vB8, vC0, vC2, vC4, vC6, vC8 + trn2 \vA1\().2s, \vB0\().2s, \vC0\().2s // odd-numbered output = lane 1 of vB and of vC; the trn1 that follows gives lane 0 of each + trn1 \vA0\().2s, \vB0\().2s, \vC0\().2s + trn2 \vA3\().2s, \vB2\().2s, \vC2\().2s + trn1 \vA2\().2s, \vB2\().2s, \vC2\().2s + trn2 \vA5\().2s, \vB4\().2s, \vC4\().2s + trn1 \vA4\().2s, \vB4\().2s, \vC4\().2s + trn2 \vA7\().2s, \vB6\().2s, \vC6\().2s + trn1 \vA6\().2s, \vB6\().2s, \vC6\().2s + trn2 \vA9\().2s, \vB8\().2s, \vC8\().2s + trn1 \vA8\().2s, \vB8\().2s, \vC8\().2s +.endm + +.macro vector_transpose vA, vB, vC +
vector_transpose_inner \vA\()0, \vA\()1, \vA\()2, \vA\()3, \vA\()4, \vA\()5, \vA\()6, \vA\()7, \vA\()8, \vA\()9, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8, \vC\()0, \vC\()2, \vC\()4, \vC\()6, \vC\()8, +.endm + +.macro vector_to_scalar_inner sA0, sA2, sA4, sA6, sA8, vB0, vB2, vB4, vB6, vB8 + mov \sA0, \vB0\().d[0] + mov \sA2, \vB2\().d[0] + mov \sA4, \vB4\().d[0] + mov \sA6, \vB6\().d[0] + mov \sA8, \vB8\().d[0] +.endm + +.macro vector_to_scalar sA, vB + vector_to_scalar_inner \sA\()0, \sA\()2, \sA\()4, \sA\()6, \sA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8 +.endm + +.macro scalar_to_vector_inner vA0, vA2, vA4, vA6, vA8, sB0, sB2, sB4, sB6, sB8 + mov \vA0\().d[0], \sB0 + mov \vA2\().d[0], \sB2 + mov \vA4\().d[0], \sB4 + mov \vA6\().d[0], \sB6 + mov \vA8\().d[0], \sB8 +.endm + +.macro scalar_to_vector vA, sB + scalar_to_vector_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \sB\()0, \sB\()2, \sB\()4, \sB\()6, \sB\()8 +.endm + + +.macro vector_extract_upper_inner vA0, vA2, vA4, vA6, vA8, vB0, vB2, vB4, vB6, vB8 + mov \vA0\().d[0], \vB0\().d[1] + mov \vA2\().d[0], \vB2\().d[1] + mov \vA4\().d[0], \vB4\().d[1] + mov \vA6\().d[0], \vB6\().d[1] + mov \vA8\().d[0], \vB8\().d[1] +.endm + +.macro vector_extract_upper vA, vB + vector_extract_upper_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()2, \vB\()4, \vB\()6, \vB\()8 +.endm + +.macro vector_compress_inner vA0, vA2, vA4, vA6, vA8, vB0, vB1, vB2, vB3, vB4, vB5, vB6, vB7, vB8, vB9 + trn1 \vA0\().4s, \vB0\().4s, \vB1\().4s + trn1 \vA2\().4s, \vB2\().4s, \vB3\().4s + trn1 \vA4\().4s, \vB4\().4s, \vB5\().4s + trn1 \vA6\().4s, \vB6\().4s, \vB7\().4s + trn1 \vA8\().4s, \vB8\().4s, \vB9\().4s +.endm + +.macro vector_compress vA, vB + vector_compress_inner \vA\()0, \vA\()2, \vA\()4, \vA\()6, \vA\()8, \vB\()0, \vB\()1, \vB\()2, \vB\()3, \vB\()4, \vB\()5, \vB\()6, \vB\()7, \vB\()8, \vB\()9, +.endm + +.macro scalar_clear_carries_inner sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9 + and \sA1, \sA1, 
#0x1ffffff + and \sA3, \sA3, #0x1ffffff + and \sA5, \sA5, #0x1ffffff + and \sA7, \sA7, #0x1ffffff + mov W<\sA0>, W<\sA0> + mov W<\sA2>, W<\sA2> + mov W<\sA4>, W<\sA4> + mov W<\sA6>, W<\sA6> + mov W<\sA8>, W<\sA8> +.endm + +.macro scalar_clear_carries sA + scalar_clear_carries_inner \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9 +.endm + +.macro scalar_decompress_inner sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9 + lsr \sA1, \sA0, #32 + lsr \sA3, \sA2, #32 + lsr \sA5, \sA4, #32 + lsr \sA7, \sA6, #32 + lsr \sA9, \sA8, #32 +.endm + +.macro scalar_decompress sA + scalar_decompress_inner \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9 +.endm + + // TODO: eliminate those. should be easy + vR_l4h4l5h5 .req vADBC4 + vR_l6h6l7h7 .req vADBC5 + + vR_l0h0l1h1 .req vADBC0 + vR_l2h2l3h3 .req vADBC1 + + vR_l0123 .req vADBC4 + vR_l4567 .req vADBC6 + vR_h0123 .req vADBC5 + vR_h4567 .req vADBC7 + vR_l89h89 .req vADBC8 + + vR_h89xx .req vADBC9 + + vSum0123 .req vADBC0 + vSum4567 .req vADBC1 + vSum89xx .req vADBC2 + + vDiff0123 .req v10 + vDiff4567 .req v11 + vDiff89xx .req v12 + + // TODO: eliminate those explicit register assignments by converting stack_vld1r and stack_vldr_bform to AArch64Instruction + vrepack_inner_tmp .req v19 + vrepack_inner_tmp2 .req v0 + +.macro vector_addsub_repack_inner vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9, vC0, vC1, vC2, vC3, vC4, vC5, vC6, vC7, vC8, vC9 + uzp1 vR_l4h4l5h5.4s, \vC4\().4s, \vC5\().4s + uzp1 vR_l6h6l7h7.4s, \vC6\().4s, \vC7\().4s + ld1r {vrepack_inner_tmp.2d}, [sp] // @slothy:reads=mask1 + uzp1 vR_l4567.4s, vR_l4h4l5h5.4s, vR_l6h6l7h7.4s + uzp2 vR_h4567.4s, vR_l4h4l5h5.4s, vR_l6h6l7h7.4s + trn1 vR_l89h89.4s, \vC8\().4s, \vC9\().4s + ldr B, [sp, #STACK_MASK2] // @slothy:reads=mask2 + uzp1 vR_l0h0l1h1.4s, \vC0\().4s, \vC1\().4s + uzp1 vR_l2h2l3h3.4s, \vC2\().4s, \vC3\().4s + mov vR_h89xx.d[0], vR_l89h89.d[1] + uzp1 vR_l0123.4s, vR_l0h0l1h1.4s, 
vR_l2h2l3h3.4s + uzp2 vR_h0123.4s, vR_l0h0l1h1.4s, vR_l2h2l3h3.4s + add vDiff4567.4s, vR_l4567.4s, vrepack_inner_tmp.4s + add vDiff89xx.2s, vR_l89h89.2s, vrepack_inner_tmp.2s + mov vrepack_inner_tmp.b[0], vrepack_inner_tmp2.b[0] + add vSum0123.4s, vR_l0123.4s, vR_h0123.4s + add vSum4567.4s, vR_l4567.4s, vR_h4567.4s + add vSum89xx.2s, vR_l89h89.2s, vR_h89xx.2s + add vDiff0123.4s, vR_l0123.4s, vrepack_inner_tmp.4s + sub vDiff4567.4s, vDiff4567.4s, vR_h4567.4s + sub vDiff0123.4s, vDiff0123.4s, vR_h0123.4s + sub vDiff89xx.2s, vDiff89xx.2s, vR_h89xx.2s + zip1 \vA0\().4s, vDiff0123.4s, vSum0123.4s + zip2 \vA2\().4s, vDiff0123.4s, vSum0123.4s + zip1 \vA4\().4s, vDiff4567.4s, vSum4567.4s + zip2 \vA6\().4s, vDiff4567.4s, vSum4567.4s + zip1 \vA8\().2s, vDiff89xx.2s, vSum89xx.2s + zip2 \vA9\().2s, vDiff89xx.2s, vSum89xx.2s + mov \vA1\().d[0], \vA0\().d[1] + mov \vA3\().d[0], \vA2\().d[1] + mov \vA5\().d[0], \vA4\().d[1] + mov \vA7\().d[0], \vA6\().d[1] +.endm + +.macro vector_addsub_repack vA, vC +vector_addsub_repack_inner \vA\()0, \vA\()1, \vA\()2, \vA\()3, \vA\()4, \vA\()5, \vA\()6, \vA\()7, \vA\()8, \vA\()9, \vC\()0, \vC\()1, \vC\()2, \vC\()3, \vC\()4, \vC\()5, \vC\()6, \vC\()7, \vC\()8, \vC\()9 +.endm + +// sAA0 .. sAA9 output AA = A^2 +// sA0 .. 
sA9 input A +// TODO: simplify (this is still the same instruction order as before; we can make it simpler and leave the re-ordering to Slothy) +.macro scalar_sqr_inner sAA0, sAA1, sAA2, sAA3, sAA4, sAA5, sAA6, sAA7, sAA8, sAA9, sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9 + lsr \sA1, \sA0, #32 + lsr \sA3, \sA2, #32 + lsr \sA5, \sA4, #32 + lsr \sA7, \sA6, #32 + lsr \sA9, \sA8, #32 + add X, \sA9, \sA9 + add X, \sA8, \sA8 + add X, \sA7, \sA7 + add X, \sA6, \sA6 + add X, \sA5, \sA5 + add X, \sA4, \sA4 + add X, \sA3, \sA3 + add X, \sA2, \sA2 + add X, \sA1, \sA1 + umull X, W<\sA4>, W<\sA4> + umull X, W<\sA4>, W + mul W<\sA9>, W<\sA9>, W + mul W<\sA7>, W<\sA7>, W + mul W<\sA5>, W<\sA5>, W + umaddl X, W<\sA9>, W, X + umaddl X, W<\sA0>, W, X + umull X, W<\sA0>, W<\sA0> + umull X, W<\sA0>, W + umull X, W<\sA0>, W + umull X, W<\sA0>, W + umull X, W<\sA0>, W + umull X, W<\sA0>, W + umull X, W<\sA0>, W + umull X, W<\sA0>, W + umaddl X, W<\sA0>, W, X + mul W, W<\sA6>, W + umaddl X, W<\sA1>, W, X + umaddl X, W<\sA1>, W, X + umaddl X, W, W, X + umaddl X, W<\sA1>, W, X + umaddl X, W, W, X + umaddl X, W<\sA1>, W, X + umaddl X, W, W, X + umaddl X, W<\sA1>, W, X + mul W, W<\sA8>, W + umaddl X, W<\sA2>, W<\sA2>, X + umaddl X, W<\sA2>, W, X + umaddl X, W<\sA2>, W, X + umaddl X, W<\sA2>, W, X + umaddl X, W<\sA2>, W, X + umaddl X, W<\sA2>, W, X + umaddl X, W<\sA3>, W, X + umaddl X, W<\sA3>, W, X + umaddl X, W, W, X + umaddl X, W<\sA3>, W, X + umaddl X, W<\sA8>, W, X + umaddl X, W<\sA6>, W, X + add X, X, X, lsr #26 + umaddl X, W<\sA5>, W, X + add X, X, X, lsr #25 + bic X, X, #0x1ffffff + add X, X, X, lsr #24 + and X, X, #0x1ffffff + add X, X, X, lsr #21 + umaddl X, W<\sA7>, W, X + add X, X, X + add X, X, X + add X, X, X + add X, X, X + umaddl X, W, W, X + umaddl X, W, W, X + and X, X, #0x3ffffff + umaddl X, W<\sA7>, W, X + umaddl X, W<\sA7>, W, X + umaddl X, W<\sA7>, W, X + umaddl X, W<\sA7>, W, X + umaddl X, W, W, X + umaddl X, W, W, X + umaddl X, W, W, X + umaddl X, W, W, X + umaddl 
X, W, W, X + umaddl X, W, W, X + umaddl X, W<\sA9>, W, X + umaddl X, W<\sA9>, W, X + umaddl X, W<\sA9>, W, X + umaddl X, W<\sA9>, W, X + umaddl X, W<\sA9>, W, X + umaddl X, W<\sA9>, W, X + umaddl X, W<\sA9>, W, X + umaddl X, W<\sA9>, W, X + add \sAA1, X, X, lsr #26 + and \sAA0, X, #0x3ffffff + add \sAA2, X, \sAA1, lsr #25 + bfi \sAA0, \sAA1, #32, #25 + add \sAA3, X, \sAA2, lsr #26 + and \sAA2, \sAA2, #0x3ffffff + add \sAA4, X, \sAA3, lsr #25 + bfi \sAA2, \sAA3, #32, #25 + add \sAA5, X, \sAA4, lsr #26 + and \sAA4, \sAA4, #0x3ffffff + add \sAA6, X, \sAA5, lsr #25 + bfi \sAA4, \sAA5, #32, #25 + add \sAA7, X, \sAA6, lsr #26 + and \sAA6, \sAA6, #0x3ffffff + add \sAA8, X, \sAA7, lsr #25 + bfi \sAA6, \sAA7, #32, #25 + add \sAA9, X, \sAA8, lsr #26 + and \sAA8, \sAA8, #0x3ffffff + bfi \sAA8, \sAA9, #32, #26 +.endm + +.macro scalar_sqr sAA, sA +scalar_sqr_inner \sAA\()0, \sAA\()1, \sAA\()2, \sAA\()3, \sAA\()4, \sAA\()5, \sAA\()6, \sAA\()7, \sAA\()8, \sAA\()9, \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9 +.endm + +// sC0 .. sC9 output C = A*B +// sA0 .. sA9 input A +// sB0 .. 
sB9 input B +.macro scalar_mul_inner sC0, sC1, sC2, sC3, sC4, sC5, sC6, sC7, sC8, sC9, sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9, sB0, sB1, sB2, sB3, sB4, sB5, sB6, sB7, sB8, sB9 + + + mul W, W<\sA1>, W + mul W, W<\sA2>, W + mul W, W<\sA3>, W + mul W, W<\sA5>, W + mul W, W<\sA6>, W + mul W, W<\sA7>, W + mul W, W<\sA8>, W + mul W, W<\sA9>, W + + umull X, W<\sA1>, W<\sB8> + umaddl X, W<\sA3>, W<\sB6>, X + umaddl X, W<\sA5>, W<\sB4>, X + umaddl X, W<\sA7>, W<\sB2>, X + umaddl X, W<\sA9>, W<\sB0>, X + umaddl X, W<\sA0>, W<\sB9>, X + umaddl X, W<\sA2>, W<\sB7>, X + umaddl X, W<\sA4>, W<\sB5>, X + umaddl X, W<\sA6>, W<\sB3>, X + umaddl X, W<\sA8>, W<\sB1>, X + + umull X, W<\sA1>, W<\sB7> + umaddl X, W<\sA3>, W<\sB5>, X + umaddl X, W<\sA5>, W<\sB3>, X + umaddl X, W<\sA7>, W<\sB1>, X + umaddl X, W, W<\sB9>, X + add X, X, X + umaddl X, W<\sA0>, W<\sB8>, X + umaddl X, W<\sA2>, W<\sB6>, X + umaddl X, W<\sA4>, W<\sB4>, X + umaddl X, W<\sA6>, W<\sB2>, X + umaddl X, W<\sA8>, W<\sB0>, X + + + umull X, W<\sA1>, W<\sB6> + umaddl X, W<\sA3>, W<\sB4>, X + umaddl X, W<\sA5>, W<\sB2>, X + umaddl X, W<\sA7>, W<\sB0>, X + umaddl X, W, W<\sB8>, X + umaddl X, W<\sA0>, W<\sB7>, X + umaddl X, W<\sA2>, W<\sB5>, X + umaddl X, W<\sA4>, W<\sB3>, X + umaddl X, W<\sA6>, W<\sB1>, X + umaddl X, W, W<\sB9>, X + + umull X, W<\sA1>, W<\sB5> + umaddl X, W<\sA3>, W<\sB3>, X + umaddl X, W<\sA5>, W<\sB1>, X + umaddl X, W, W<\sB9>, X + umaddl X, W, W<\sB7>, X + add X, X, X + umaddl X, W<\sA0>, W<\sB6>, X + umaddl X, W<\sA2>, W<\sB4>, X + umaddl X, W<\sA4>, W<\sB2>, X + umaddl X, W<\sA6>, W<\sB0>, X + umaddl X, W, W<\sB8>, X + + umull X, W, W<\sB6> + umaddl X, W<\sA5>, W<\sB0>, X + umaddl X, W, W<\sB8>, X + umaddl X, W<\sA3>, W<\sB2>, X + umaddl X, W<\sA1>, W<\sB4>, X + umaddl X, W, W<\sB7>, X + umaddl X, W, W<\sB9>, X + umaddl X, W<\sA4>, W<\sB1>, X + umaddl X, W<\sA2>, W<\sB3>, X + umaddl X, W<\sA0>, W<\sB5>, X + + umull X, W, W<\sB5> + umaddl X, W, W<\sB7>, X + umaddl X, W, W<\sB9>, X + umaddl X, 
W<\sA3>, W<\sB1>, X + umaddl X, W<\sA1>, W<\sB3>, X + add X, X, X + umaddl X, W, W<\sB6>, X + umaddl X, W, W<\sB8>, X + umaddl X, W<\sA4>, W<\sB0>, X + umaddl X, W<\sA2>, W<\sB2>, X + umaddl X, W<\sA0>, W<\sB4>, X + + umull X, W, W<\sB4> + umaddl X, W, W<\sB6>, X + umaddl X, W, W<\sB8>, X + umaddl X, W<\sA3>, W<\sB0>, X + umaddl X, W<\sA1>, W<\sB2>, X + mul W, W<\sA4>, W + umaddl X, W, W<\sB5>, X + umaddl X, W, W<\sB7>, X + umaddl X, W, W<\sB9>, X + umaddl X, W<\sA2>, W<\sB1>, X + umaddl X, W<\sA0>, W<\sB3>, X + + add X, X, X, lsr #26 + and \sC4, X, #0x3ffffff + add X, X, X, lsr #25 + and \sC5, X, #0x1ffffff + add X, X, X, lsr #26 + and \sC6, X, #0x3ffffff + add X, X, X, lsr #25 + bfi \sC6, X, #32, #25 + add X, X, X, lsr #26 + and \sC8, X, #0x3ffffff + bic X, X, #0x3ffffff + lsr X, X, #26 + bfi \sC8, X, #32, #26 + add X, X, X, lsr #25 + add X, X, X, lsr #22 + + umaddl X, W, W<\sB1>, X + umaddl X, W, W<\sB3>, X + umaddl X, W, W<\sB5>, X + umaddl X, W, W<\sB7>, X + umaddl X, W, W<\sB9>, X + add X, X, X + umaddl X, W, W<\sB2>, X + umaddl X, W, W<\sB4>, X + umaddl X, W, W<\sB6>, X + umaddl X, W, W<\sB8>, X + umaddl X, W<\sA0>, W<\sB0>, X + + umull X, W, W<\sB2> + umaddl X, W, W<\sB4>, X + umaddl X, W, W<\sB6>, X + umaddl X, W, W<\sB8>, X + umaddl X, W<\sA1>, W<\sB0>, X + umaddl X, W, W<\sB3>, X + umaddl X, W, W<\sB5>, X + umaddl X, W, W<\sB7>, X + umaddl X, W, W<\sB9>, X + umaddl X, W<\sA0>, W<\sB1>, X + + umull X, W, W<\sB3> + umaddl X, W, W<\sB5>, X + umaddl X, W, W<\sB7>, X + umaddl X, W, W<\sB9>, X + umaddl X, W<\sA1>, W<\sB1>, X + add X, X, X + umaddl X, W, W<\sB4>, X + umaddl X, W, W<\sB6>, X + umaddl X, W, W<\sB8>, X + umaddl X, W<\sA2>, W<\sB0>, X + umaddl X, W<\sA0>, W<\sB2>, X + + add \sC1, X, X, lsr #26 + and \sC0, X, #0x3ffffff + add \sC2, X, \sC1, lsr #25 + bfi \sC0, \sC1, #32, #25 + add X, X, \sC2, lsr #26 + and \sC2, \sC2, #0x3ffffff + add \sC4, \sC4, X, lsr #25 + bfi \sC2, X, #32, #25 + add \sC5, \sC5, \sC4, lsr #26 + and \sC4, \sC4, #0x3ffffff + bfi 
\sC4, \sC5, #32, #26 +.endm + +.macro scalar_mul sC, sA, sB +scalar_mul_inner \sC\()0, \sC\()1, \sC\()2, \sC\()3, \sC\()4, \sC\()5, \sC\()6, \sC\()7, \sC\()8, \sC\()9, \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9, \sB\()0, \sB\()1, \sB\()2, \sB\()3, \sB\()4, \sB\()5, \sB\()6, \sB\()7, \sB\()8, \sB\()9 +.endm + +xtmp_scalar_sub_0 .req x21 + +// sC0 .. sC4 output C = A + 4p - B (registers may be the same as A) +// sA0 .. sA4 first operand A +// sB0 .. sB4 second operand B +.macro scalar_sub_inner sC0, sC1, sC2, sC3, sC4, sA0, sA1, sA2, sA3, sA4, sB0, sB1, sB2, sB3, sB4 + + ldr xtmp_scalar_sub_0, #=0x07fffffe07fffffc + add \sC1, \sA1, xtmp_scalar_sub_0 + add \sC2, \sA2, xtmp_scalar_sub_0 + add \sC3, \sA3, xtmp_scalar_sub_0 + add \sC4, \sA4, xtmp_scalar_sub_0 + movk xtmp_scalar_sub_0, #0xffb4 + add \sC0, \sA0, xtmp_scalar_sub_0 + sub \sC0, \sC0, \sB0 + sub \sC1, \sC1, \sB1 + sub \sC2, \sC2, \sB2 + sub \sC3, \sC3, \sB3 + sub \sC4, \sC4, \sB4 +.endm + +.macro scalar_sub sC, sA, sB +scalar_sub_inner \sC\()0, \sC\()2, \sC\()4, \sC\()6, \sC\()8, \sA\()0, \sA\()2, \sA\()4, \sA\()6, \sA\()8, \sB\()0, \sB\()2, \sB\()4, \sB\()6, \sB\()8 +.endm + + +.macro scalar_addm_inner sC0, sC1, sC2, sC3, sC4, sC5, sC6, sC7, sC8, sC9, sA0, sA1, sA2, sA3, sA4, sA5, sA6, sA7, sA8, sA9, sB0, sB1, sB2, sB3, sB4, sB5, sB6, sB7, sB8, sB9, multconst + + ldr X, #=\multconst + umaddl \sC9, W<\sB9>, W, \sA9 + umaddl \sC0, W<\sB0>, W, \sA0 + umaddl \sC1, W<\sB1>, W, \sA1 + umaddl \sC2, W<\sB2>, W, \sA2 + lsr X, \sC9, #25 + umaddl \sC3, W<\sB3>, W, \sA3 + and \sC9, \sC9, #0x1ffffff + umaddl \sC4, W<\sB4>, W, \sA4 + add \sC0, \sC0, X + umaddl \sC5, W<\sB5>, W, \sA5 + add \sC0, \sC0, X, lsl #1 + umaddl \sC6, W<\sB6>, W, \sA6 + add \sC0, \sC0, X, lsl #4 + umaddl \sC7, W<\sB7>, W, \sA7 + umaddl \sC8, W<\sB8>, W, \sA8 + + add \sC1, \sC1, \sC0, lsr #26 + and \sC0, \sC0, #0x3ffffff + add \sC2, \sC2, \sC1, lsr #25 + and \sC1, \sC1, #0x1ffffff + add \sC3, \sC3, \sC2, 
lsr #26 + and \sC2, \sC2, #0x3ffffff + add \sC4, \sC4, \sC3, lsr #25 + and \sC3, \sC3, #0x1ffffff + add \sC5, \sC5, \sC4, lsr #26 + and \sC4, \sC4, #0x3ffffff + add \sC6, \sC6, \sC5, lsr #25 + and \sC5, \sC5, #0x1ffffff + add \sC7, \sC7, \sC6, lsr #26 + and \sC6, \sC6, #0x3ffffff + add \sC8, \sC8, \sC7, lsr #25 + and \sC7, \sC7, #0x1ffffff + add \sC9, \sC9, \sC8, lsr #26 + and \sC8, \sC8, #0x3ffffff +.endm + +.macro scalar_addm sC, sA, sB, multconst +scalar_addm_inner \sC\()0, \sC\()1, \sC\()2, \sC\()3, \sC\()4, \sC\()5, \sC\()6, \sC\()7, \sC\()8, \sC\()9, \sA\()0, \sA\()1, \sA\()2, \sA\()3, \sA\()4, \sA\()5, \sA\()6, \sA\()7, \sA\()8, \sA\()9, \sB\()0, \sB\()1, \sB\()2, \sB\()3, \sB\()4, \sB\()5, \sB\()6, \sB\()7, \sB\()8, \sB\()9, \multconst +.endm + +// vAA0 .. vAA9 output AA = A^2 +// vA0 .. vA9 input A +.macro vector_sqr_inner vAA0, vAA1, vAA2, vAA3, vAA4, vAA5, vAA6, vAA7, vAA8, vAA9, vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9 + shl V.2s, \vA9\().2s, #1 + shl V.2s, \vA8\().2s, #1 + shl V.2s, \vA7\().2s, #1 + shl V.2s, \vA6\().2s, #1 + shl V.2s, \vA5\().2s, #1 + shl V.2s, \vA4\().2s, #1 + shl V.2s, \vA3\().2s, #1 + shl V.2s, \vA2\().2s, #1 + shl V.2s, \vA1\().2s, #1 + umull V.2d, \vA0\().2s, V.2s + umlal V.2d, \vA1\().2s, V.2s + umlal V.2d, \vA2\().2s, V.2s + umlal V.2d, \vA3\().2s, V.2s + umlal V.2d, \vA4\().2s, V.2s + umull V.2d, \vA0\().2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, \vA2\().2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, \vA4\().2s, \vA4\().2s + mul V.2s, \vA9\().2s, vconst19.2s + umull V.2d, \vA0\().2s, V.2s + umlal V.2d, \vA1\().2s, V.2s + umlal V.2d, \vA2\().2s, V.2s + umlal V.2d, \vA3\().2s, V.2s + umlal V.2d, V.2s, V.2s + umull V.2d, \vA0\().2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, \vA2\().2s, V.2s + umlal V.2d, V.2s, \vA3\().2s + umull V.2d, \vA0\().2s, V.2s + umlal V.2d, \vA1\().2s, V.2s + umlal V.2d, \vA2\().2s, V.2s + umull V.2d, \vA0\().2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, \vA2\().2s, \vA2\().2s + umull 
V.2d, \vA0\().2s, V.2s + umlal V.2d, \vA1\().2s, V.2s + umull V.2d, \vA0\().2s, V.2s + umlal V.2d, V.2s, \vA1\().2s + umull V.2d, \vA0\().2s, V.2s + umull V.2d, \vA0\().2s, \vA0\().2s + usra V.2d, V.2d, #26 + and V.16b, V.16b, vMaskA.16b + mul V.2s, \vA8\().2s, vconst19.2s + bic V.16b, V.16b, vMaskB.16b + and \vA9\().16b, V.16b, vMaskB.16b + usra V.2d, V.2d, #25 + mul V.2s, \vA7\().2s, vconst19.2s + usra V.2d, V.2d, #24 + mul V.2s, \vA6\().2s, vconst19.2s + usra V.2d, V.2d, #21 + mul V.2s, \vA5\().2s, vconst19.2s + shl V.2s, V.2s, #1 + shl V.2s, V.2s, #1 + shl V.2s, V.2s, #1 + shl V.2s, V.2s, #1 + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, \vA6\().2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + usra V.2d, V.2d, #26 + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + usra V.2d, V.2d, #25 + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + usra V.2d, V.2d, #26 + umlal V.2d, V.2s, V.2s + umlal V.2d, V.2s, V.2s + usra V.2d, V.2d, #25 + umlal V.2d, V.2s, \vA8\().2s + umlal V.2d, V.2s, V.2s + usra V.2d, V.2d, #26 + umlal V.2d, V.2s, V.2s + usra V.2d, V.2d, #25 + usra V.2d, V.2d, #26 + usra V.2d, V.2d, #25 + usra \vAA9\().2d, V.2d, #26 + and \vAA4\().16b, V.16b, vMaskA.16b + and \vAA5\().16b, V.16b, vMaskB.16b + and \vAA0\().16b, V.16b, vMaskA.16b + and \vAA6\().16b, V.16b, vMaskA.16b + and \vAA1\().16b, V.16b, vMaskB.16b + and \vAA7\().16b, V.16b, vMaskB.16b + and \vAA2\().16b, V.16b, vMaskA.16b + and \vAA8\().16b, V.16b, vMaskA.16b + and \vAA3\().16b, V.16b, vMaskB.16b +.endm + +.macro vector_sqr vAA, vA +vector_sqr_inner \vAA\()0, \vAA\()1, \vAA\()2, \vAA\()3, \vAA\()4, \vAA\()5, \vAA\()6, \vAA\()7, \vAA\()8, \vAA\()9, \vA\()0, \vA\()1, \vA\()2, \vA\()3, \vA\()4, \vA\()5, \vA\()6, \vA\()7, 
\vA\()8, \vA\()9 +.endm + +// vC0 .. vC9 output C = A*B +// vA0 .. vA9 first operand A +// vB0 .. vB9 second operand B +.macro vector_mul_inner vC0, vC1, vC2, vC3, vC4, vC5, vC6, vC7, vC8, vC9, vA0, vA1, vA2, vA3, vA4, vA5, vA6, vA7, vA8, vA9, vB0, vB1, vB2, vB3, vB4, vB5, vB6, vB7, vB8, vB9 + umull \vC9\().2d, \vA0\().2s, \vB9\().2s + umlal \vC9\().2d, \vA2\().2s, \vB7\().2s + umlal \vC9\().2d, \vA4\().2s, \vB5\().2s + umlal \vC9\().2d, \vA6\().2s, \vB3\().2s + umlal \vC9\().2d, \vA8\().2s, \vB1\().2s + mul \vB9\().2s, \vB9\().2s, vconst19.2s + umull \vC8\().2d, \vA1\().2s, \vB7\().2s + umlal \vC8\().2d, \vA3\().2s, \vB5\().2s + umlal \vC8\().2d, \vA5\().2s, \vB3\().2s + umlal \vC8\().2d, \vA7\().2s, \vB1\().2s + umlal \vC8\().2d, \vA9\().2s, \vB9\().2s + umlal \vC9\().2d, \vA1\().2s, \vB8\().2s + umlal \vC9\().2d, \vA3\().2s, \vB6\().2s + umlal \vC9\().2d, \vA5\().2s, \vB4\().2s + umlal \vC9\().2d, \vA7\().2s, \vB2\().2s + umlal \vC9\().2d, \vA9\().2s, \vB0\().2s + shl \vC8\().2d, \vC8\().2d, #1 + umull \vC7\().2d, \vA0\().2s, \vB7\().2s + umlal \vC7\().2d, \vA2\().2s, \vB5\().2s + umlal \vC7\().2d, \vA4\().2s, \vB3\().2s + umlal \vC7\().2d, \vA6\().2s, \vB1\().2s + umlal \vC7\().2d, \vA8\().2s, \vB9\().2s + mul \vB7\().2s, \vB7\().2s, vconst19.2s + umlal \vC8\().2d, \vA0\().2s, \vB8\().2s + umlal \vC8\().2d, \vA2\().2s, \vB6\().2s + umlal \vC8\().2d, \vA4\().2s, \vB4\().2s + umlal \vC8\().2d, \vA6\().2s, \vB2\().2s + umlal \vC8\().2d, \vA8\().2s, \vB0\().2s + mul \vB8\().2s, \vB8\().2s, vconst19.2s + umull \vC6\().2d, \vA1\().2s, \vB5\().2s + umlal \vC6\().2d, \vA3\().2s, \vB3\().2s + umlal \vC6\().2d, \vA5\().2s, \vB1\().2s + umlal \vC6\().2d, \vA7\().2s, \vB9\().2s + umlal \vC6\().2d, \vA9\().2s, \vB7\().2s + umlal \vC7\().2d, \vA1\().2s, \vB6\().2s + umlal \vC7\().2d, \vA3\().2s, \vB4\().2s + umlal \vC7\().2d, \vA5\().2s, \vB2\().2s + umlal \vC7\().2d, \vA7\().2s, \vB0\().2s + umlal \vC7\().2d, \vA9\().2s, \vB8\().2s + shl \vC6\().2d, \vC6\().2d, #1 + umull 
\vC5\().2d, \vA0\().2s, \vB5\().2s + umlal \vC5\().2d, \vA2\().2s, \vB3\().2s + umlal \vC5\().2d, \vA4\().2s, \vB1\().2s + umlal \vC5\().2d, \vA6\().2s, \vB9\().2s + umlal \vC5\().2d, \vA8\().2s, \vB7\().2s + mul \vB5\().2s, \vB5\().2s, vconst19.2s + umlal \vC6\().2d, \vA0\().2s, \vB6\().2s + umlal \vC6\().2d, \vA2\().2s, \vB4\().2s + umlal \vC6\().2d, \vA4\().2s, \vB2\().2s + umlal \vC6\().2d, \vA6\().2s, \vB0\().2s + umlal \vC6\().2d, \vA8\().2s, \vB8\().2s + mul \vB6\().2s, \vB6\().2s, vconst19.2s + umull \vC4\().2d, \vA1\().2s, \vB3\().2s + umlal \vC4\().2d, \vA3\().2s, \vB1\().2s + umlal \vC4\().2d, \vA5\().2s, \vB9\().2s + umlal \vC4\().2d, \vA7\().2s, \vB7\().2s + umlal \vC4\().2d, \vA9\().2s, \vB5\().2s + umlal \vC5\().2d, \vA1\().2s, \vB4\().2s + umlal \vC5\().2d, \vA3\().2s, \vB2\().2s + umlal \vC5\().2d, \vA5\().2s, \vB0\().2s + umlal \vC5\().2d, \vA7\().2s, \vB8\().2s + umlal \vC5\().2d, \vA9\().2s, \vB6\().2s + shl \vC4\().2d, \vC4\().2d, #1 + umull \vC3\().2d, \vA0\().2s, \vB3\().2s + umlal \vC3\().2d, \vA2\().2s, \vB1\().2s + umlal \vC3\().2d, \vA4\().2s, \vB9\().2s + umlal \vC3\().2d, \vA6\().2s, \vB7\().2s + umlal \vC3\().2d, \vA8\().2s, \vB5\().2s + mul \vB3\().2s, \vB3\().2s, vconst19.2s + umlal \vC4\().2d, \vA0\().2s, \vB4\().2s + umlal \vC4\().2d, \vA2\().2s, \vB2\().2s + umlal \vC4\().2d, \vA4\().2s, \vB0\().2s + umlal \vC4\().2d, \vA6\().2s, \vB8\().2s + umlal \vC4\().2d, \vA8\().2s, \vB6\().2s + mul \vB4\().2s, \vB4\().2s, vconst19.2s + umull \vC2\().2d, \vA1\().2s, \vB1\().2s + umlal \vC2\().2d, \vA3\().2s, \vB9\().2s + umlal \vC2\().2d, \vA5\().2s, \vB7\().2s + umlal \vC2\().2d, \vA7\().2s, \vB5\().2s + umlal \vC2\().2d, \vA9\().2s, \vB3\().2s + umlal \vC3\().2d, \vA1\().2s, \vB2\().2s + umlal \vC3\().2d, \vA3\().2s, \vB0\().2s + umlal \vC3\().2d, \vA5\().2s, \vB8\().2s + umlal \vC3\().2d, \vA7\().2s, \vB6\().2s + umlal \vC3\().2d, \vA9\().2s, \vB4\().2s + shl \vC2\().2d, \vC2\().2d, #1 + umull \vC1\().2d, \vA0\().2s, \vB1\().2s + umlal 
\vC1\().2d, \vA2\().2s, \vB9\().2s + umlal \vC1\().2d, \vA4\().2s, \vB7\().2s + umlal \vC1\().2d, \vA6\().2s, \vB5\().2s + umlal \vC1\().2d, \vA8\().2s, \vB3\().2s + mul \vB1\().2s, \vB1\().2s, vconst19.2s + umlal \vC2\().2d, \vA0\().2s, \vB2\().2s + umlal \vC2\().2d, \vA2\().2s, \vB0\().2s + umlal \vC2\().2d, \vA4\().2s, \vB8\().2s + umlal \vC2\().2d, \vA6\().2s, \vB6\().2s + umlal \vC2\().2d, \vA8\().2s, \vB4\().2s + mul \vB2\().2s, \vB2\().2s, vconst19.2s + umull \vC0\().2d, \vA1\().2s, \vB9\().2s + umlal \vC0\().2d, \vA3\().2s, \vB7\().2s + umlal \vC0\().2d, \vA5\().2s, \vB5\().2s + ushr vMaskB.2d, vMaskA.2d, #1 + usra \vC3\().2d, \vC2\().2d, #26 + and \vC2\().16b, \vC2\().16b, vMaskA.16b + umlal \vC1\().2d, \vA1\().2s, \vB0\().2s + usra \vC4\().2d, \vC3\().2d, #25 + and \vC3\().16b, \vC3\().16b, vMaskB.16b + umlal \vC0\().2d, \vA7\().2s, \vB3\().2s + usra \vC5\().2d, \vC4\().2d, #26 + and \vC4\().16b, \vC4\().16b, vMaskA.16b + umlal \vC1\().2d, \vA3\().2s, \vB8\().2s + usra \vC6\().2d, \vC5\().2d, #25 + and \vC5\().16b, \vC5\().16b, vMaskB.16b + umlal \vC0\().2d, \vA9\().2s, \vB1\().2s + usra \vC7\().2d, \vC6\().2d, #26 + and \vC6\().16b, \vC6\().16b, vMaskA.16b + umlal \vC1\().2d, \vA5\().2s, \vB6\().2s + umlal \vC1\().2d, \vA7\().2s, \vB4\().2s + umlal \vC1\().2d, \vA9\().2s, \vB2\().2s + usra \vC8\().2d, \vC7\().2d, #25 + and \vC7\().16b, \vC7\().16b, vMaskB.16b + shl \vC0\().2d, \vC0\().2d, #1 + usra \vC9\().2d, \vC8\().2d, #26 + and \vC8\().16b, \vC8\().16b, vMaskA.16b + umlal \vC0\().2d, \vA0\().2s, \vB0\().2s + umlal \vC0\().2d, \vA2\().2s, \vB8\().2s + umlal \vC0\().2d, \vA4\().2s, \vB6\().2s + umlal \vC0\().2d, \vA6\().2s, \vB4\().2s + umlal \vC0\().2d, \vA8\().2s, \vB2\().2s + bic \vB9\().16b, \vC9\().16b, vMaskB.16b + and \vC9\().16b, \vC9\().16b, vMaskB.16b + usra \vC0\().2d, \vB9\().2d, #25 + usra \vC0\().2d, \vB9\().2d, #24 + usra \vC0\().2d, \vB9\().2d, #21 + usra \vC1\().2d, \vC0\().2d, #26 + and \vC0\().16b, \vC0\().16b, vMaskA.16b + usra 
\vC2\().2d, \vC1\().2d, #25 + and \vC1\().16b, \vC1\().16b, vMaskB.16b + usra \vC3\().2d, \vC2\().2d, #26 + and \vC2\().16b, \vC2\().16b, vMaskA.16b +.endm + +.macro vector_mul vC, vA, vB +vector_mul_inner \vC\()0, \vC\()1, \vC\()2, \vC\()3, \vC\()4, \vC\()5, \vC\()6, \vC\()7, \vC\()8, \vC\()9, \vA\()0, \vA\()1, \vA\()2, \vA\()3, \vA\()4, \vA\()5, \vA\()6, \vA\()7, \vA\()8, \vA\()9, \vB\()0, \vB\()1, \vB\()2, \vB\()3, \vB\()4, \vB\()5, \vB\()6, \vB\()7, \vB\()8, \vB\()9 +.endm + + // in: x1: scalar pointer, x2: base point pointer + // out: x0: result pointer + .global x25519_scalarmult_alt_orig + .global _x25519_scalarmult_alt_orig + // .type x25519_scalarmult, %function +x25519_scalarmult_alt_orig: +_x25519_scalarmult_alt_orig: + stp x29, x30, [sp, #-160]! + mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp x25, x26, [sp, #64] + stp x27, x28, [sp, #80] + stp d8, d9, [sp, #96] + stp d10, d11, [sp, #112] + stp d12, d13, [sp, #128] + stp d14, d15, [sp, #144] + sub sp, sp, STACK_OUT_PTR+8 + + // 0: mask1, 8: mask2, 16: AA, 56: B/BB, 96: counter, 100: lastbit, 104: scalar, 136: X1, 176: outptr, 184: padding, 192: fp, 200: lr + + str x0, [sp, STACK_OUT_PTR] // outptr + mov x19, x2 // point + + mov x0, x1 // scalar + bl load256unaligned + + and x3, x3, #0x7fffffffffffffff + and x0, x0, #0xfffffffffffffff8 + orr x3, x3, #0x4000000000000000 + + stp x0, x1, [sp, STACK_SCALAR] + stp x2, x3, [sp, STACK_SCALAR+16] + + mov x0, x19 // point + bl load256unaligned + + // Unpack point (discard most significant bit) + lsr x12, x0, #51 + lsr x17, x2, #51 + orr w12, w12, w1, lsl #13 + orr w17, w17, w3, lsl #13 + ubfx x8, x3, #12, #26 + ubfx x9, x3, #38, #25 + ubfx x11, x0, #26, #25 + ubfx x13, x1, #13, #25 + lsr x14, x1, #38 + ubfx x16, x2, #25, #26 + and w10, w0, #0x3ffffff + and w12, w12, #0x3ffffff + and w15, w2, #0x1ffffff + and w17, w17, #0x1ffffff + stp w10, w11, [sp, STACK_X_0] + stp w12, w13, [sp, STACK_X_8] + stp w14, w15, [sp, 
STACK_X_16] + stp w16, w17, [sp, STACK_X_24] + stp w8, w9, [sp, STACK_X_32] + + // X2 (initially set to 1) + mov x1, #1 + mov v0.d[0], x1 + mov v2.d[0], xzr + mov v4.d[0], xzr + mov v6.d[0], xzr + mov v8.d[0], xzr + + // Z2 (initially set to 0) + mov v1.d[0], xzr + mov v3.d[0], xzr + mov v5.d[0], xzr + mov v7.d[0], xzr + mov v9.d[0], xzr + + // X3 (initially set to X1) + mov v10.s[0], w10 + mov v10.s[1], w11 + mov v12.s[0], w12 + mov v12.s[1], w13 + mov v14.s[0], w14 + mov v14.s[1], w15 + mov v16.s[0], w16 + mov v16.s[1], w17 + mov v18.s[0], w8 + mov v18.s[1], w9 + + // Z3 (initially set to 1) + mov v11.d[0], x1 + mov v13.d[0], xzr + mov v15.d[0], xzr + mov v17.d[0], xzr + mov v19.d[0], xzr + + mov x0, #255-1 // 255 iterations + str W0, [sp, #STACK_CTR] // @slothy:writes=ctr + + const19 .req x30 + vconst19 .req v31 + + mov w30, #19 + dup vconst19.2s, w30 + mov x0, #(1<<26)-1 + dup v30.2d, x0 + ldr x0, #=0x07fffffe07fffffc + // TODO: I do not quite understand what the two stps are doing + // First seems to write bytes 0-15 (mask1+mask2); second seems to write bytes 16-31 (mask2+A) + // stp x0, x0, [sp, #STACK_MASK1] // @slothy:writes=mask1 + + sub x1, x0, #0xfc-0xb4 + str x0, [sp, #STACK_MASK1] // @slothy:writes=mask1 + str x1, [sp, #STACK_MASK2] // @slothy:writes=mask2 + + ldr d28, [sp, #STACK_MASK2] // @slothy:reads=mask2 + ldr d29, [sp, #STACK_MASK1] // @slothy:reads=mask1 + + ldrb w1, [sp, #STACK_SCALAR+31] + lsr w1, w1, #6 + str w1, [sp, #STACK_LASTBIT] // @slothy:writes=lastbit + mainloop: + sub v22.2S, v29.2S, v13.2S // 
............*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + add v25.2S, v12.2S, v13.2S // ...........................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add v20.2S, v10.2S, v11.2S // ..........................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + sub v23.2S, v29.2S, v5.2S // ...*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add v21.2S, v4.2S, v23.2S // ........*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + tst w1, #1 // *............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add v27.2S, v2.2S, v3.2S // ......................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + fcsel_dform v23, v27, v25, eq // ................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ mov x6, v23.d[0] // .....................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add v22.2S, v12.2S, v22.2S // .................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ sub v26.2S, v29.2S, v19.2S // ...............*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + sub v13.2S, v29.2S, v17.2S // ..............*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ sub v24.2S, v29.2S, v9.2S // .....*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + sub v12.2S, v29.2S, v7.2S // ....*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ sub v23.2S, v29.2S, v15.2S // .............*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + sub v3.2S, v29.2S, v3.2S // ..*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add v29.2S, v18.2S, v26.2S // ....................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + sub v26.2S, v28.2S, v11.2S // ...........*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add v11.2S, v18.2S, v19.2S // ..............................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add v19.2S, v14.2S, v23.2S // ..................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add v23.2S, v0.2S, v1.2S // .....................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add v18.2S, v8.2S, v9.2S // .........................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add v9.2S, v14.2S, v15.2S // ............................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + add v14.2S, v8.2S, v24.2S // ..........*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add v8.2S, v6.2S, v12.2S // .........*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add v3.2S, v2.2S, v3.2S // .......*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ sub v12.2S, v28.2S, v1.2S // .*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add v24.2S, v0.2S, v12.2S // ......*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ fcsel_dform v0, v23, v20, eq // ...............................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + add v28.2S, v10.2S, v26.2S // ................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ fcsel_dform v12, v24, v28, eq // ...................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + fcsel_dform v1, v3, v22, eq // ....................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add v26.2S, v16.2S, v17.2S // .............................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add v15.2S, v6.2S, v7.2S // ........................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ fcsel_dform v6, v15, v26, eq // ..................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + fcsel_dform v17, v21, v19, eq // .....................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ trn1 v7.2S, v15.2S, v8.2S // ................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + mov x21, v6.d[0] // .......................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ lsr x22, x21, #32 // ........................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + fcsel_dform v10, v18, v11, eq // ...................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mov x24, v10.d[0] // ........................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + lsr x13, x24, #32 // .........................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add v5.2S, v4.2S, v5.2S // .......................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + fcsel_dform v10, v5, v9, eq // .................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ mov x0, v10.d[0] // ......................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + lsr x14, x0, #32 // .......................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x28, x14, x14 // ..............................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + trn2 v15.2S, v15.2S, v8.2S // ...............................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ stp d12, d1, [sp, #STACK_B_0] // ..................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:writes=[B0,B8] + trn2 v1.2S, v5.2S, v21.2S // 
.............................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + trn1 v5.2S, v5.2S, v21.2S // ..............................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ trn1 v2.2S, v27.2S, v3.2S // ............................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + mov x8, v0.d[0] // ....................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull x26, w0, w0 // ...................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + lsr x23, x8, #32 // .....................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ add x17, x23, x23 // ..................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + trn2 v3.2S, v27.2S, v3.2S // ...........................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x20, x13, x13 // ..........................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x7, x0, x0 // ...............................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umull x12, w0, w28 // ....................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x0, x24, x24 // ...........................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x11, w8, w20, x12 // .........................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x12, w23, w0, x11 // ...........................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add v27.2S, v16.2S, v13.2S // ...................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + trn2 v13.2S, v18.2S, v14.2S // .................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ trn1 v12.2S, v22.2S, v25.2S // ...........................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + trn2 v22.2S, v22.2S, v25.2S // ..........................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul w19, w13, w30 // .....................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x2, w19, w20, x26 // ........................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x26, w8, w0, x2 // ..................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x10, x22, x22 // ............................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x4, w6, w10, x12 // ..................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x20, x21, x21 // .............................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x29, w17, w10, x26 // ..........................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x2, w6, w20, x29 // .................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ lsr x27, x6, #32 // ......................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x29, x27, x27 // ................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ mul w16, w14, w30 // .......................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x13, w29, w28, x2 // .....................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umull x9, w8, w8 // ..........................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x26, x6, x6 // .................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x5, w16, w28, x9 // ..........................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umull x9, w8, w20 // ................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x3, w17, w28, x9 // ........................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x12, w6, w7, x3 // ...............................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x3, x28, x28 // ...................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + mul w25, w22, w30 // ......................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul w18, w21, w30 // ...................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umull x14, w8, w26 // ............................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x9, w23, w17, x14 // ....................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x9, w21, w18, x9 // ........................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x2, w25, w3, x9 // ..........................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + mul w22, w24, w30 // ............................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x9, w27, w29, x12 // ...................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x15, w24, w22, x9 // .......................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x9, x10, x10 // ....................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + fcsel_dform v4, v8, v27, eq // ......................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ stp d17, d4, [sp, #STACK_B_16] // ...................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:writes=[B16,B24] + umaddl x1, w27, w20, x4 // 
......................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x11, x1, x13, lsr #26 // .........................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x12, x5, x11, lsr #25 // ...........................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + bic x1, x11, #0x1ffffff // ............................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x21, x12, x1, lsr #24 // .............................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + trn1 v4.2S, v18.2S, v14.2S // ..................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x24, x29, x29 // ..................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x5, x21, x1, lsr #21 // ...............................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x14, w18, w7, x5 // .....................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x14, w25, w24, x14 // ........................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x21, w22, w26, x14 // ............................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + trn1 v0.2S, v23.2S, v24.2S // ..........................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ trn2 v16.2S, v19.2S, v9.2S // ............................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umull x12, w8, w7 // ..............................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x1, w17, w29, x12 // ......................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x5, w6, w6, x1 // .............................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x12, w25, w10, x5 // ................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x4, w22, w20, x12 // ................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x3, w19, w3, x4 // ......................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + fcsel_dform v21, v14, v29, eq // .......................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ trn1 v14.2S, v19.2S, v9.2S // .............................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + trn1 v18.2S, v29.2S, v11.2S // .................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ trn2 v9.2S, v23.2S, v24.2S // .........................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + trn2 v23.2S, v29.2S, v11.2S // ................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umull x5, w8, w28 // ...............................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umull x16, w8, w29 // .............................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x4, w23, w26, x16 // .....................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x4, w25, w20, x4 // ...........................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x12, w22, w28, x4 // ...............................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x14, w19, w7, x12 // .....................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ add x16, x17, x17 // .................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x16, w19, w16, x21 // ..................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x1, w23, w7, x5 // .......................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x21, w6, w29, x1 // ..............................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x12, w22, w10, x21 // .................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umull x4, w8, w10 // .................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x21, w19, w20, x12 // .......................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x2, w22, w7, x2 // ..............................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x12, w19, w24, x2 // ....................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x2, w23, w20, x4 // .........................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x1, w6, w28, x2 // ................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x2, w27, w7, x1 // ....................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x10, x16, #0x3ffffff // ...........................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x6, w19, w0, x2 // .........................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull x2, w8, w17 // ...........................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x8, w18, w28, x2 // ......................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x1, x13, #0x3ffffff // .......................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + ldr x4, [sp, #STACK_B_0] // ................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
// @slothy:reads=[B0] + umaddl x24, w25, w7, x8 // .........................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x18, w22, w29, x24 // .............................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x27, w19, w9, x15 // ........................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x17, w19, w26, x18 // ...................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x25, x17, x16, lsr #26 // ..........................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x5, x12, x25, lsr #25 // ............................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ lsr x8, x4, #32 // .....................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add x23, x8, x8 // ..................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x29, x23, x23 // .................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add x0, x14, x5, lsr #26 // ..............................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x17, x3, x0, lsr #25 // ................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + add x20, x21, x17, lsr #26 // ..................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x13, x27, x20, lsr #25 // ....................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x22, x6, x13, lsr #26 // ......................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ str d21, [sp, #STACK_B_32] // ....................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:writes=[B32] + ldr x12, [sp, #STACK_B_32] // 
....................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=[B32] + lsr x21, x12, #32 // .........................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x3, x21, x21 // ..........................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x27, x5, #0x3ffffff // ...............................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ and x26, x13, #0x3ffffff // .......................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + bfi x26, x22, #32, #25 // .........................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ bfi x27, x0, #32, #25 // .................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add x22, x1, x22, lsr #25 // ........................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x14, x12, x12 // ...........................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + bfi x10, x25, #32, #25 // .............................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ and x17, x17, #0x3ffffff // ...................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + bfi x17, x20, #32, #25 // .....................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ ldr x28, [sp, #STACK_B_8] // .................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=[B8] + lsr x2, x28, #32 // ......................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ ldr x13, [sp, #STACK_B_16] // ..................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=[B16] + lsr x19, x13, #32 // .......................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul w15, w19, w30 // .......................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umull x1, w4, w4 // ..........................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ stp x17, x26, [sp, #STACK_A_16] // ..............................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:writes=[A16,A24] + stp x10, x27, [sp, #STACK_A_0] // 
.............................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:writes=[A0,A8] + and x5, x22, #0x3ffffff // ...........................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ and x20, x11, #0x1ffffff // ..............................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + ldr x18, [sp, #STACK_B_24] // ...................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=[B24] + lsr x27, x18, #32 // ........................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x24, x27, x27 // ............................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umull x7, w4, w24 // .................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add x16, x18, x18 // .............................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ add x6, x20, x22, lsr #26 // ..........................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x11, x24, x24 // ....................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ bfi x5, x6, #32, #26 // ............................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + str x5, [sp, #STACK_A_32] // ...............................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
// @slothy:writes=[A32] + mul w22, w12, w30 // ............................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umull x5, w4, w16 // ................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x26, x2, x2 // ................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + add x25, x13, x13 // ...............................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x17, x19, x19 // ..............................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x0, w23, w17, x5 // ........................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x19, w28, w25, x0 // ...............................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x6, w2, w26, x19 // ...................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x12, w12, w22, x6 // .......................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x9, x28, x28 // .................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umull x10, w4, w9 // ............................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x6, w8, w23, x10 // ....................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul w10, w18, w30 // ...................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x18, w18, w10, x6 // ........................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul w5, w27, w30 // ......................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x20, x17, x17 // ...................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x0, w5, w20, x18 // ..........................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x19, w22, w25, x0 // ..............................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul w6, w21, w30 // .....................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umull x0, w4, w17 // ...............................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x21, w8, w25, x0 // .......................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x27, w28, w26, x21 // ..............................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x0, w22, w24, x27 // .................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x27, w6, w16, x0 // .......................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull x21, w4, w23 // ...........................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x18, w10, w17, x21 // ......................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x18, w5, w25, x18 // .........................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x18, w22, w26, x18 // .............................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x0, w6, w9, x18 // ...................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umull x21, w4, w25 // ..............................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x18, w23, w26, x21 // ......................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x18, w28, w28, x18 // .............................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umull x21, w13, w17 // ....................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x21, w4, w3, x21 // .........................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x21, w8, w14, x21 // ...........................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x21, w28, w24, x21 // ..................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x21, w2, w16, x21 // ......................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x1, w15, w17, x1 // ..........................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull x15, w13, w13 // ...................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x15, w6, w3, x15 // ........................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x13, w4, w14, x15 // ..................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x13, w23, w24, x13 // ..........................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x23, w28, w16, x13 // .................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x13, w26, w17, x23 // .....................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ add x15, x21, x13, lsr #26 // .........................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x21, x1, x15, lsr #25 // ...........................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x1, x26, x26 // ..................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x23, w6, w1, x19 // ....................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull x3, w4, w26 // .............................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x4, w8, w9, x3 // .....................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x26, w5, w16, x4 // ...........................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x3, w22, w17, x26 // ...............................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x26, w6, w25, x3 // .....................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + bic x3, x15, #0x1ffffff // ............................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x21, x21, x3, lsr #24 // .............................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add x19, x21, x3, lsr #21 // ...............................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x10, w10, w25, x19 // .....................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x10, w5, w1, x10 // ........................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x10, w22, w9, x10 // ............................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x19, w6, w29, x10 // ..................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x3, x0, x19, lsr #26 // ..........................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x1, x23, x3, lsr #25 // ............................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x21, x26, x1, lsr #26 // ..............................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + ldr x9, #=0x07fffffe07fffffc // .....................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ ldr x10, [sp, #STACK_A_32] // ....................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=[A32] + add x4, x10, x9 // .........................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ ldr x23, [sp, #STACK_A_8] // .................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=[A8] + add x29, x23, x9 // ......................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x26, w5, w24, x18 // ................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + and x1, x1, #0x3ffffff // ...............................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ bfi x1, x21, #32, #25 // .................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + sub x5, x29, x1 // .............................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ and x0, x3, #0x1ffffff // .................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x23, w22, w16, x26 // ................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x22, w6, w20, x23 // ......................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x10, x22, x21, lsr #25 // ................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x20, x27, x10, lsr #26 // ..................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x29, x20, #0x1ffffff // ...................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x23, w6, w11, x12 // ........................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x27, x23, x20, lsr #25 // ....................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x23, w8, w16, x7 // .........................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x23, w28, w17, x23 // ................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x28, w2, w25, x23 // ....................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x25, w6, w14, x28 // .........................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x11, x25, x27, lsr #26 // ......................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x23, x13, #0x3ffffff // .......................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x2, x23, x11, lsr #25 // ........................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x25, x10, #0x3ffffff // ...................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ bfi x25, x20, #32, #25 // .....................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + and x18, x21, #0x1ffffff // ..................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ ldr x16, [sp, #STACK_A_16] // ..................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=[A16] + add x26, x16, x9 // .......................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x13, x15, #0x1ffffff // ..............................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x13, x13, x2, lsr #26 // ..........................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ ldr x6, [sp, #STACK_A_24] // ...................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=[A24] + trn2 v6.2S, v28.2S, v20.2S // 
........................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + trn1 v10.2S, v28.2S, v20.2S // .........................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ trn2 v11.2S, v27.2S, v26.2S // ..............................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + trn1 v17.2S, v27.2S, v26.2S // ...............................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x22, x6, x9 // ........................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + sub x26, x26, x25 // ..............................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ ldr x14, #=121666 // ...............................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + mov w10, w1 // ......................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x16, x27, #0x3ffffff // .......................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + bfi x16, x11, #32, #25 // .........................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mov w7, w16 // ........................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x6, x2, #0x3ffffff // ...........................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ and x15, x19, #0x3ffffff // ...........................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + bfi x6, x13, #32, #26 // ............................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ and x2, x11, #0x1ffffff // ....................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + movk x9, #0xffb4 // ..........................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mov w21, w25 // .......................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + mov w24, w6 // .........................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ str x6, [sp, #STACK_B_32] // ...............................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:writes=[B32] + stp x25, x16, [sp, #STACK_B_16] // 
..............................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:writes=[B16,B24] + lsr x25, x26, #32 // ............................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ ldr x12, [sp, #STACK_A_0] // ................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=[A0] + add x12, x12, x9 // ...........................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ bfi x15, x3, #32, #25 // .............................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + stp x15, x1, [sp, #STACK_B_0] // .............................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
// @slothy:writes=[B0,B8] + mov w9, w15 // .....................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + sub x16, x22, x16 // ...............................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x3, w16, w14, x7 // ...........................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x23, w26, w14, x21 // .......................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x1, w5, w14, x10 // ...................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + lsr x20, x16, #32 // .............................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ sub x19, x12, x15 // ............................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x15, w19, w14, x9 // .................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ lsr x7, x19, #32 // ..........................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x11, w7, w14, x0 // ..................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ lsr x10, x5, #32 // ...........................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x21, w10, w14, x18 // .....................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x2, w20, w14, x2 // .............................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x17, w25, w14, x29 // .........................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ sub x6, x4, x6 // ................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + lsr x27, x6, #32 // ..............................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x12, w27, w14, x13 // ................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + lsr x9, x12, #25 // ....................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x4, x15, x9 // ........................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x4, x4, x9, lsl #1 // ..........................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x0, x4, x9, lsl #4 // ............................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x22, w6, w14, x24 // ..............................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x14, x12, #0x1ffffff // ......................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x4, x11, x0, lsr #26 // ...............................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x15, x1, x4, lsr #25 // .................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add x28, x21, x15, lsr #26 // ...................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x9, x23, x28, lsr #25 // .....................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + add x12, x17, x9, lsr #26 // .......................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x11, x3, x12, lsr #25 // .........................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x29, x2, x11, lsr #26 // ...........................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ and x24, x29, #0x1ffffff // ..............................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x13, x4, #0x1ffffff // ..................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x0, x0, #0x3ffffff // ................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umull x8, w13, w20 // ...................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull x23, w13, w6 // .........................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x2, x28, #0x1ffffff // ......................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x21, x15, #0x3ffffff // ....................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x18, x22, x29, lsr #25 // .............................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ and x28, x18, #0x3ffffff // ................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + and x4, x12, #0x1ffffff // ..........................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x3, x14, x18, lsr #26 // ...............................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + mul w14, w21, w30 // ..................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x1, w2, w25, x8 // ....................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x8, w4, w10, x1 // .....................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x17, w24, w7, x8 // ......................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + mul w29, w3, w30 // ........................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x22, w2, w16, x23 // ..........................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + mul w18, w24, w30 // ......................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x17, w29, w27, x17 // .......................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x23, x17, x17 // ........................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x12, x9, #0x3ffffff // ........................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x8, w4, w26, x22 // ...........................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x8, w24, w5, x8 // ............................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x9, w0, w6, x23 // .........................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x15, x11, #0x3ffffff // ............................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x8, w3, w19, x8 // .............................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x17, w0, w27, x8 // ..............................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x1, w21, w20, x17 // ...............................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umull x11, w13, w16 // ..............................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x23, w2, w26, x11 // ...............................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umull x8, w13, w25 // ........................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x22, w4, w5, x23 // ................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x17, w2, w10, x8 // .........................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x23, w4, w7, x17 // ..........................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x3, w18, w27, x23 // ...........................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x9, w21, w16, x9 // ..........................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x17, w24, w19, x22 // .................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x8, w29, w20, x3 // ............................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ add x8, x8, x8 // .............................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x3, w29, w6, x17 // ..................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x23, w0, w20, x3 // ...................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x11, w0, w16, x8 // ..............................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul w3, w28, w30 // .......................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x11, w21, w26, x11 // ...............................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x11, w12, w5, x11 // ................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x8, w15, w19, x11 // .................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x11, w3, w6, x8 // ..................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + mul w17, w4, w30 // ....................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul w8, w15, w30 // .....................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umull x24, w29, w25 // .............................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x24, w18, w20, x24 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x22, w17, w27, x24 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x24, w2, w7, x22 // ................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x24, w13, w10, x24 // .................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ add x22, x24, x24 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x24, w3, w16, x22 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x24, w8, w6, x24 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x24, w12, w19, x24 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x24, w21, w5, x24 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x23, w21, w25, x23 // ....................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x23, w12, w10, x23 // .....................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x22, w15, w7, x23 // ......................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull x23, w29, w16 // ...................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x4, w4, w19, x23 // ....................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x4, w18, w6, x4 // .....................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x4, w2, w5, x4 // ......................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x23, w13, w26, x4 // .......................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x4, w3, w20, x23 // ........................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x4, w8, w27, x4 // .........................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x23, w12, w7, x4 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x4, w21, w10, x23 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x23, w0, w25, x4 // ............................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x9, w12, w26, x9 // ...........................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x9, w15, w5, x9 // ............................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x4, w28, w19, x9 // .............................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x9, w12, w25, x1 // ................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x9, w15, w10, x9 // .................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x9, w28, w7, x9 // ..................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x15, w3, w27, x22 // .......................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x28, w0, w26, x24 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x24, x23, x28, lsr #26 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x1, x11, x24, lsr #25 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ add x11, x15, x1, lsr #26 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x23, x4, x11, lsr #25 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x9, x9, x23, lsr #26 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + and x15, x24, #0x1ffffff // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ and x4, x23, #0x3ffffff // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + bfi x4, x9, #32, #26 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umull x22, w29, w26 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x22, w18, w16, x22 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x23, w17, w6, x22 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x23, w2, w19, x23 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x24, w13, w5, x23 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + mul v27.2S, v23.2S, v31.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umull v29.2D, v0.2S, v22.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v29.2D, v2.2S, v6.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v29.2D, v5.2S, v27.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................... + umull v21.2D, v0.2S, v11.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v21.2D, v2.2S, v16.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v21.2D, v5.2S, v22.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v21.2D, v7.2S, v6.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umull v25.2D, v0.2S, v23.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v25.2D, v2.2S, v11.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v25.2D, v5.2S, v16.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v25.2D, v7.2S, v22.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umull v20.2D, v9.2S, v11.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v20.2D, v3.2S, v16.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v20.2D, v1.2S, v22.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull v23.2D, v0.2S, v16.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v23.2D, v2.2S, v22.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umull v8.2D, v9.2S, v16.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v8.2D, v3.2S, v22.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull v28.2D, v9.2S, v22.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................... + mul v22.2S, v22.2S, v31.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................... 
+ mul v26.2S, v11.2S, v31.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................. + mul v16.2S, v16.2S, v31.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v28.2D, v3.2S, v6.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................. + umlal v28.2D, v1.2S, v27.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v28.2D, v15.2S, v26.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v28.2D, v13.2S, v16.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................... 
+ shl v28.2D, v28.2D, #1 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v28.2D, v0.2S, v14.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v28.2D, v2.2S, v12.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................. + umlal v20.2D, v15.2S, v6.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v20.2D, v13.2S, v27.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................. + shl v20.2D, v20.2D, #1 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v20.2D, v0.2S, v18.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v25.2D, v4.2S, v6.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v25.2D, v9.2S, v18.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................ + mul v18.2S, v18.2S, v31.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v8.2D, v1.2S, v6.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v8.2D, v15.2S, v27.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v8.2D, v13.2S, v26.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................... + shl v11.2D, v8.2D, #1 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umlal v11.2D, v0.2S, v17.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v25.2D, v3.2S, v17.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v21.2D, v4.2S, v27.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umlal v23.2D, v5.2S, v6.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v23.2D, v7.2S, v27.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v23.2D, v4.2S, v26.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v23.2D, v9.2S, v14.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................. + umlal v23.2D, v3.2S, v12.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v23.2D, v1.2S, v10.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v23.2D, v15.2S, v18.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull v19.2D, v0.2S, v6.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................. + umlal v19.2D, v2.2S, v27.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................ 
+ umull v8.2D, v9.2S, v6.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................ + umlal v8.2D, v3.2S, v27.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................... 
+ umull v27.2D, v9.2S, v27.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................... + umlal v27.2D, v3.2S, v26.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v27.2D, v1.2S, v16.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................... + umlal v27.2D, v15.2S, v22.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................ 
+ umlal v8.2D, v1.2S, v26.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................... + umlal v8.2D, v15.2S, v16.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v19.2D, v5.2S, v26.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................... + umlal v19.2D, v7.2S, v16.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v19.2D, v4.2S, v22.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................. + umlal v8.2D, v13.2S, v22.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................ 
+ shl v22.2D, v8.2D, #1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................. + mul w12, w12, w30 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x24, w3, w25, x24 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x22, w8, w20, x24 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x24, w12, w27, x22 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x24, w21, w7, x24 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ mul w2, w2, w30 // ...................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + mul w23, w13, w30 // .................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umull x22, w29, w10 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x22, w18, w25, x22 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x22, w17, w20, x22 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x22, w2, w27, x22 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x22, w13, w7, x22 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + add x22, x22, x22 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x22, w3, w26, x22 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umaddl x22, w8, w16, x22 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x22, w12, w6, x22 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x21, w21, w19, x22 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ bic x22, x9, #0x3ffffff // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umull x9, w29, w5 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x9, w18, w26, x9 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x9, w17, w16, x9 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x9, w2, w6, x9 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + umaddl x13, w13, w19, x9 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ lsr x9, x22, #26 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v29.2D, v7.2S, v26.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v19.2D, v9.2S, v10.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................... + umlal v25.2D, v1.2S, v14.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v25.2D, v15.2S, v12.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v29.2D, v4.2S, v16.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v29.2D, v9.2S, v12.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................... + umlal v29.2D, v3.2S, v10.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v11.2D, v2.2S, v14.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v22.2D, v0.2S, v12.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v22.2D, v2.2S, v10.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................... + umlal v22.2D, v5.2S, v18.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................... 
+ add x9, x9, x22, lsr #25 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + add x22, x9, x22, lsr #22 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x22, w29, w7, x22 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x22, w18, w10, x22 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x22, w17, w25, x22 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x9, w2, w20, x22 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x2, w23, w27, x9 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x9, x2, x2 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x9, w3, w5, x9 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x26, w8, w26, x9 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x23, w12, w16, x26 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x16, w14, w6, x23 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x18, w3, w10, x13 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x13, w8, w25, x18 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x26, w12, w20, x13 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x13, w14, w27, x26 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umaddl x12, w0, w7, x13 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + umaddl x20, w0, w19, x16 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umaddl x13, w0, w5, x21 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + umlal v21.2D, v9.2S, v17.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v21.2D, v3.2S, v14.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v20.2D, v2.2S, v17.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v20.2D, v5.2S, v14.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................. + mul v14.2S, v14.2S, v31.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................. 
+ mul v17.2S, v17.2S, v31.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v22.2D, v7.2S, v17.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................ 
+ umlal v22.2D, v4.2S, v14.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................... + umlal v29.2D, v1.2S, v18.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v29.2D, v15.2S, v17.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................... + umlal v29.2D, v13.2S, v14.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................... 
+ usra v29.2D, v22.2D, #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................. + and v8.16B, v22.16B, v30.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................ 
+ and x19, x28, #0x3ffffff // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x27, x12, x20, lsr #26 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umaddl x12, w0, w10, x24 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x21, x13, x27, lsr #25 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x2, x12, x21, lsr #26 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + add x12, x19, x2, lsr #25 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add x9, x15, x12, lsr #26 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + ushr v26.2D, v30.2D, #1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................. 
+ and v22.16B, v29.16B, v26.16B // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................. + umlal v28.2D, v5.2S, v10.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................ 
+ umlal v28.2D, v7.2S, v18.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................... + umlal v28.2D, v4.2S, v17.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v21.2D, v1.2S, v12.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................... + usra v28.2D, v29.2D, #25 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v11.2D, v5.2S, v12.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v11.2D, v7.2S, v10.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v11.2D, v4.2S, v18.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................... + umlal v23.2D, v13.2S, v17.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................... 
+ usra v23.2D, v28.2D, #26 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................... + usra v11.2D, v23.2D, #25 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................ 
+ umlal v21.2D, v15.2S, v10.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................. + umlal v21.2D, v13.2S, v18.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ usra v21.2D, v11.2D, #26 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................... + umlal v20.2D, v7.2S, v12.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v20.2D, v4.2S, v10.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................ + usra v20.2D, v21.2D, #25 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................ 
+ umlal v25.2D, v13.2S, v10.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................ + usra v25.2D, v20.2D, #26 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................. 
+ and v24.16B, v25.16B, v26.16B // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................... + and v20.16B, v20.16B, v30.16B // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................ 
+ trn1 v16.4S, v20.4S, v24.4S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................... + ld1r {v20.2D}, [sp] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=mask1 + add v29.2S, v16.2S, v20.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................. 
+ mov v24.d[0], v16.d[1] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................. + sub v29.2S, v29.2S, v24.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................... 
+ add v24.2S, v16.2S, v24.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................... + and v16.16B, v28.16B, v30.16B // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................... 
+ and v28.16B, v23.16B, v26.16B // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................... + uzp1 v23.4S, v16.4S, v28.4S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................... 
+ ldr b16, [sp, #STACK_MASK2] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................... // @slothy:reads=mask2 + and v11.16B, v11.16B, v30.16B // 
.........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................... + and v21.16B, v21.16B, v26.16B // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................... 
+ uzp1 v21.4S, v11.4S, v21.4S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................... + uzp1 v11.4S, v23.4S, v21.4S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................ 
+ add v28.4S, v11.4S, v20.4S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................... + uzp2 v21.4S, v23.4S, v21.4S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................... 
+ sub v28.4S, v28.4S, v21.4S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................ + mov v20.b[0], v16.b[0] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................. 
+ add v21.4S, v11.4S, v21.4S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................... + mul v12.2S, v12.2S, v31.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................... 
+ mul v11.2S, v6.2S, v31.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................ + umlal v27.2D, v13.2S, v11.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................... 
+ shl v6.2D, v27.2D, #1 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................. + umlal v6.2D, v0.2S, v10.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................... 
+ umlal v6.2D, v2.2S, v18.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................... + bic v0.16B, v25.16B, v26.16B // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................... 
+ umlal v6.2D, v5.2S, v17.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................... + umlal v6.2D, v7.2S, v14.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................ 
+ umlal v6.2D, v4.2S, v12.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................... + usra v6.2D, v0.2D, #25 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................... 
+ usra v6.2D, v0.2D, #24 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................... + usra v6.2D, v0.2D, #21 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................. 
+ and v7.16B, v6.16B, v30.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................ + umlal v19.2D, v3.2S, v18.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................... 
+ umlal v19.2D, v1.2S, v17.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................... + umlal v19.2D, v15.2S, v14.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................. 
+ umlal v19.2D, v13.2S, v12.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................. + usra v19.2D, v6.2D, #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................. 
+ and v12.16B, v19.16B, v26.16B // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................. + usra v8.2D, v19.2D, #25 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................... 
+ ldr x8, [sp, #STACK_CTR] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=[ctr,lastbit] + and v25.16B, v8.16B, v30.16B // 
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................ + usra v22.2D, v8.2D, #26 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................. 
+ uzp1 v18.4S, v25.4S, v22.4S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................... + add x13, sp, #STACK_SCALAR // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ subs w0, w8, #1 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + asr w19, w0, #5 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ and w5, w0, #0x1f // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x15, x12, #0x3ffffff // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ ldr w13, [x13, w19, SXTW #2] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x12, x1, #0x3ffffff // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ bfi x12, x11, #32, #25 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + and x21, x21, #0x3ffffff // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ bfi x21, x2, #32, #25 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + bfi x15, x9, #32, #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ uzp1 v25.4S, v7.4S, v12.4S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................... + lsr w11, w13, w5 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ stp w0, w11, [sp, #STACK_CTR] // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:writes=[ctr,lastbit] + zip1 v10.4S, v28.4S, v21.4S // 
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................... + zip2 v12.4S, v28.4S, v21.4S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................. 
+ and x9, x20, #0x3ffffff // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + bfi x9, x27, #32, #25 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ lsr x2, x8, #32 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + uzp2 v0.4S, v25.4S, v18.4S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................ 
+ uzp1 v25.4S, v25.4S, v18.4S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................. + add v18.4S, v25.4S, v0.4S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................ 
+ add v25.4S, v25.4S, v20.4S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................... + sub v25.4S, v25.4S, v0.4S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................... 
+ zip1 v6.4S, v25.4S, v18.4S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................... + zip2 v0.4S, v25.4S, v18.4S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................... 
+ mov v9.d[0], v6.d[1] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................... + zip1 v4.2S, v29.2S, v24.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................. 
+ zip2 v5.2S, v29.2S, v24.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................ + add x8, sp, #STACK_B_0 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................... 
+ mov v15.d[0], v12.d[1] // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................ + shl v24.2S, v15.2S, #1 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................... 
+ shl v8.2S, v9.2S, #1 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................... + shl v16.2S, v4.2S, #1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................... 
+ umull v23.2D, v6.2S, v16.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................. + umlal v23.2D, v8.2S, v24.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................ 
+ mov v3.d[0], v0.d[1] // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................. + shl v22.2S, v3.2S, #1 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................... 
+ shl v13.2S, v12.2S, #1 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................ + umlal v23.2D, v0.2S, v13.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................... 
+ mov v1.d[0], v10.d[1] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................. + shl v11.2S, v1.2S, #1 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................... 
+ shl v7.2S, v5.2S, #1 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................... + mul v2.2S, v5.2S, v31.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................ 
+ umlal v23.2D, v22.2S, v11.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................... + umlal v23.2D, v10.2S, v10.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................... 
+ umlal v23.2D, v2.2S, v7.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................... + umull v28.2D, v6.2S, v6.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................... 
+ mul v18.2S, v1.2S, v31.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................ + shl v25.2S, v8.2S, #1 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................... 
+ umull v1.2D, v6.2S, v7.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................. + umlal v1.2D, v9.2S, v16.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................. 
+ umlal v1.2D, v0.2S, v24.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................ + umlal v1.2D, v3.2S, v13.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................... 
+ umlal v1.2D, v10.2S, v11.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................. + usra v1.2D, v23.2D, #26 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................. 
+ bic v29.16B, v1.16B, v26.16B // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................... + usra v28.2D, v29.2D, #25 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................. 
+ usra v28.2D, v29.2D, #24 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................... + usra v28.2D, v29.2D, #21 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................... 
+ umlal v28.2D, v18.2S, v11.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................... + umlal v28.2D, v2.2S, v25.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................. 
+ mul v21.2S, v4.2S, v31.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................ + mul v20.2S, v15.2S, v31.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................ 
+ shl v17.2S, v22.2S, #1 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................... + shl v14.2S, v0.2S, #1 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................... 
+ umlal v28.2D, v21.2S, v14.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................. + umlal v28.2D, v20.2S, v17.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................ 
+ umull v7.2D, v6.2S, v13.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................. + umlal v7.2D, v8.2S, v11.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................. 
+ shl v27.2S, v10.2S, #1 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................... + umull v5.2D, v6.2S, v11.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................. 
+ umlal v5.2D, v9.2S, v27.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................. + umlal v5.2D, v0.2S, v22.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................ 
+ umlal v7.2D, v0.2S, v27.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................ + umlal v7.2D, v22.2S, v3.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................... 
+ umlal v5.2D, v2.2S, v13.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................. + umlal v7.2D, v21.2S, v4.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................... 
+ umull v10.2D, v6.2S, v24.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................... + umlal v10.2D, v9.2S, v13.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................... 
+ umlal v10.2D, v0.2S, v11.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................... + umlal v10.2D, v3.2S, v27.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................... 
+ umlal v10.2D, v2.2S, v16.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................... + shl v25.2S, v24.2S, #1 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................... 
+ umlal v7.2D, v2.2S, v25.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................... + umull v16.2D, v6.2S, v14.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................... 
+ umlal v16.2D, v8.2S, v9.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................... + mul v25.2S, v12.2S, v31.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................... 
+ umlal v16.2D, v25.2S, v12.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................... + umlal v16.2D, v2.2S, v17.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................... 
+ umlal v16.2D, v21.2S, v27.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................ + umull v12.2D, v6.2S, v8.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................... 
+ umlal v12.2D, v2.2S, v14.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................. + umlal v12.2D, v21.2S, v22.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................. 
+ shl v18.2S, v11.2S, #1 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................... + umlal v16.2D, v20.2S, v18.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................... 
+ umlal v28.2D, v25.2S, v27.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................... + umlal v12.2D, v20.2S, v27.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................ 
+ umlal v12.2D, v25.2S, v11.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................... + usra v12.2D, v28.2D, #26 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................... 
+ usra v16.2D, v12.2D, #25 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................. + umlal v5.2D, v21.2S, v24.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................ 
+ and v15.16B, v23.16B, v30.16B // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................. + umull v25.2D, v6.2S, v22.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................ 
+ umlal v25.2D, v9.2S, v14.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................... + umlal v25.2D, v2.2S, v27.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................... 
+ umlal v25.2D, v21.2S, v11.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................... + umlal v25.2D, v20.2S, v13.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................... 
+ usra v25.2D, v16.2D, #26 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................. + and v9.16B, v1.16B, v26.16B // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................. 
+ umull v4.2D, v6.2S, v27.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................... + umlal v4.2D, v8.2S, v22.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................... 
+ umlal v4.2D, v0.2S, v0.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................... + umlal v4.2D, v20.2S, v24.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................. 
+ umlal v4.2D, v2.2S, v18.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................ + umlal v4.2D, v21.2S, v13.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................... 
+ usra v4.2D, v25.2D, #25 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................... + usra v5.2D, v4.2D, #26 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................ 
+ usra v7.2D, v5.2D, #25 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................... + usra v10.2D, v7.2D, #26 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................... 
+ usra v15.2D, v10.2D, #25 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................... + and v8.16B, v15.16B, v30.16B // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................... 
+ and v5.16B, v5.16B, v26.16B // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................. + and v3.16B, v25.16B, v26.16B // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................... 
+ and v6.16B, v7.16B, v30.16B // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................... + and v7.16B, v10.16B, v26.16B // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................. 
+ ld2 { v10.S, v11.S }[1], [x8], #8 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................. // @slothy:reads=[B0] + and v1.16B, v12.16B, v26.16B // 
...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................. + ld2 { v25.S, v26.S }[1], [x8], #8 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................. 
// @slothy:reads=[B8] + ld2 { v12.S, v13.S }[1], [x8], #8 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................ 
// @slothy:reads=[B16] + usra v9.2D, v15.2D, #26 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................... 
+ and v0.16B, v28.16B, v30.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................ + add x13, sp, #STACK_X_0 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................. 
+ ld2 { v14.S, v15.S }[1], [x8], #8 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................... // @slothy:reads=[B24] + and v4.16B, v4.16B, v30.16B // 
...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................. + and v2.16B, v16.16B, v30.16B // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................ 
+ add x28, sp, #STACK_A_0 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................... + ld2 { v0.S, v1.S }[1], [x28], #8 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................ 
// @slothy:reads=[A0] + ld2 { v2.S, v3.S }[1], [x28], #8 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................... 
// @slothy:reads=[A8] + ld2 { v10.S, v11.S }[0], [x13], #8 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................ 
// @slothy:reads=[X0] + ld2 { v4.S, v5.S }[1], [x28], #8 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................... 
// @slothy:reads=[A16] + ld2 { v6.S, v7.S }[1], [x28], #8 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................... 
// @slothy:reads=[A24] + ld2 { v8.S, v9.S }[1], [x28], #8 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................... 
// @slothy:reads=[A32] + ld2 { v25.S, v26.S }[0], [x13], #8 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................... 
// @slothy:reads=[X8] + ld2 { v12.S, v13.S }[0], [x13], #8 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................... 
// @slothy:reads=[X16] + ld2 { v17.S, v18.S }[1], [x8], #8 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................. 
// @slothy:reads=[B32] + ld2 { v14.S, v15.S }[0], [x13], #8 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................... 
// @slothy:reads=[X24] + mul v23.2S, v15.2S, v31.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................. 
+ ld2 { v17.S, v18.S }[0], [x13], #8 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................ // @slothy:reads=[X32] + mul v21.2S, v18.2S, v31.2S // 
...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................. + umull v20.2D, v0.2S, v18.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................... 
+ umull v28.2D, v1.2S, v15.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................. + umlal v28.2D, v3.2S, v13.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................ 
+ umlal v28.2D, v5.2S, v26.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................... + umlal v20.2D, v2.2S, v15.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................... 
+ umull v15.2D, v0.2S, v15.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................... + umlal v15.2D, v2.2S, v13.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................... 
+ umlal v15.2D, v4.2S, v26.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................... + umlal v15.2D, v6.2S, v11.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................... 
+ umlal v15.2D, v8.2S, v21.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................. + umlal v28.2D, v7.2S, v11.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................. 
+ umlal v28.2D, v9.2S, v21.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................. + shl v22.2D, v28.2D, #1 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................... 
+ umull v24.2D, v1.2S, v26.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................... + umull v16.2D, v1.2S, v13.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................... 
+ umull v28.2D, v0.2S, v13.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................... + umlal v24.2D, v3.2S, v11.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................. 
+ umlal v16.2D, v3.2S, v26.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................... + umlal v24.2D, v5.2S, v21.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................. 
+ umlal v24.2D, v7.2S, v23.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................ + umlal v16.2D, v5.2S, v11.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................ 
+ umlal v20.2D, v4.2S, v13.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................... + umlal v16.2D, v7.2S, v21.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................... 
+ mul v13.2S, v13.2S, v31.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................... + umlal v22.2D, v0.2S, v17.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................ 
+ umlal v16.2D, v9.2S, v23.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................... + umlal v28.2D, v2.2S, v26.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................. 
+ umlal v20.2D, v6.2S, v26.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................... + umlal v28.2D, v4.2S, v11.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................. 
+ umlal v20.2D, v8.2S, v11.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................... + shl v27.2D, v16.2D, #1 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................ 
+ umlal v20.2D, v1.2S, v17.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................ + umlal v28.2D, v6.2S, v21.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................ 
+ mul v16.2S, v14.2S, v31.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................... + umull v19.2D, v0.2S, v26.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................ 
+ umlal v20.2D, v3.2S, v14.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................... + umlal v19.2D, v2.2S, v11.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................... 
+ umlal v28.2D, v8.2S, v23.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................... + umlal v27.2D, v0.2S, v14.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................... 
+ umlal v19.2D, v4.2S, v21.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................... + umlal v15.2D, v1.2S, v14.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................... 
+ umlal v19.2D, v6.2S, v23.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................... + umlal v24.2D, v9.2S, v13.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................... 
+ mul v18.2S, v17.2S, v31.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................... + umlal v28.2D, v1.2S, v12.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................. 
+ umlal v27.2D, v2.2S, v12.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................ + umlal v19.2D, v8.2S, v13.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................... 
+ umlal v28.2D, v3.2S, v25.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................. + mul v26.2S, v26.2S, v31.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................... 
+ shl v17.2D, v24.2D, #1 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................... + umlal v17.2D, v0.2S, v12.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................. 
+ umull v29.2D, v1.2S, v11.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................ + umlal v29.2D, v3.2S, v21.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................... 
+ umlal v22.2D, v2.2S, v14.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................... + umlal v19.2D, v1.2S, v25.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................... 
+ umlal v19.2D, v3.2S, v10.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................... + umlal v29.2D, v5.2S, v23.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................... 
+ umlal v29.2D, v7.2S, v13.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................... + umlal v29.2D, v9.2S, v26.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................ 
+ mul v14.2S, v12.2S, v31.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................. + umlal v19.2D, v5.2S, v18.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................... 
+ umlal v19.2D, v7.2S, v16.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................... + umlal v19.2D, v9.2S, v14.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................... 
+ shl v29.2D, v29.2D, #1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................. + umull v24.2D, v0.2S, v11.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................. 
+ umlal v29.2D, v0.2S, v25.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................... + umlal v24.2D, v2.2S, v21.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................ 
+ umlal v24.2D, v4.2S, v23.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................... + umlal v28.2D, v5.2S, v10.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................ 
+ umlal v24.2D, v6.2S, v13.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................. + umlal v17.2D, v2.2S, v25.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................. 
+ umlal v17.2D, v4.2S, v10.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................ + umlal v29.2D, v2.2S, v10.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................... 
+ umlal v29.2D, v4.2S, v18.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................... + mul v11.2S, v11.2S, v31.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................ 
+ umlal v15.2D, v3.2S, v12.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................... + umlal v15.2D, v5.2S, v25.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................... 
+ umlal v24.2D, v8.2S, v26.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................. + umlal v24.2D, v1.2S, v10.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................... 
+ umlal v29.2D, v6.2S, v16.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................ + umlal v29.2D, v8.2S, v14.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................... 
+ umull v21.2D, v1.2S, v21.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................... + umlal v21.2D, v3.2S, v23.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................... 
+ umlal v21.2D, v5.2S, v13.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................... + umlal v21.2D, v7.2S, v26.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................ 
+ umlal v21.2D, v9.2S, v11.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................... + shl v21.2D, v21.2D, #1 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................. 
+ umlal v21.2D, v0.2S, v10.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................... + umlal v21.2D, v2.2S, v18.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................... 
+ umlal v20.2D, v5.2S, v12.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................... + umlal v21.2D, v4.2S, v16.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................... 
+ umlal v21.2D, v6.2S, v14.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................ + umlal v24.2D, v3.2S, v18.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................... 
+ umlal v24.2D, v5.2S, v16.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................... + umlal v22.2D, v4.2S, v12.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................. 
+ umlal v27.2D, v4.2S, v25.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................... + trn1 v4.4S, v4.4S, v5.4S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................... 
+ mov v5.d[0], x15 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......... + trn1 v2.4S, v2.4S, v3.4S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................... 
+ mov v3.d[0], x21 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......... + umlal v27.2D, v6.2S, v10.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................... 
+ umlal v27.2D, v8.2S, v18.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................... + mul v12.2S, v25.2S, v31.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................... 
+ umlal v21.2D, v8.2S, v12.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................... + umlal v17.2D, v6.2S, v18.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................... 
+ umlal v17.2D, v8.2S, v16.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................. + umlal v22.2D, v6.2S, v25.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................. 
+ umlal v22.2D, v8.2S, v10.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................ + trn1 v8.4S, v8.4S, v9.4S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................... 
+ umlal v28.2D, v7.2S, v18.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................... + umlal v28.2D, v9.2S, v16.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................... 
+ umlal v20.2D, v7.2S, v25.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................... + umlal v20.2D, v9.2S, v10.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................ 
+ umlal v15.2D, v7.2S, v10.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................. + umlal v15.2D, v9.2S, v18.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................. 
+ umlal v24.2D, v7.2S, v14.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................. + umlal v24.2D, v9.2S, v12.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................. 
+ mov v9.d[0], x4 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....... + usra v19.2D, v29.2D, #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................. 
+ trn1 v6.4S, v6.4S, v7.4S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................ + mov v7.d[0], x12 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........ 
+ usra v17.2D, v19.2D, #25 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................. + and v11.16B, v17.16B, v30.16B // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................... 
+ usra v28.2D, v17.2D, #26 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................... + usra v27.2D, v28.2D, #25 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................ 
+ ushr v13.2D, v30.2D, #1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................. + and v23.16B, v28.16B, v13.16B // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................... 
+ ldr d28, [sp, #STACK_MASK2] // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...... // @slothy:reads=mask2 + trn1 v0.4S, v0.4S, v1.4S // 
..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................... + mov v1.d[0], x9 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........... 
+ usra v15.2D, v27.2D, #26 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................... + usra v22.2D, v15.2D, #25 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................ 
+ usra v20.2D, v22.2D, #26 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................. + bic v17.16B, v20.16B, v13.16B // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................... 
+ usra v21.2D, v17.2D, #25 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................... + usra v21.2D, v17.2D, #24 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................... 
+ usra v21.2D, v17.2D, #21 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................. + usra v24.2D, v21.2D, #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................. 
+ and v26.16B, v24.16B, v13.16B // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................. + and v25.16B, v29.16B, v30.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................ 
+ usra v25.2D, v24.2D, #25 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................... + and v17.16B, v19.16B, v13.16B // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................. 
+ usra v17.2D, v25.2D, #26 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................. + and v22.16B, v22.16B, v30.16B // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................ 
+ and v21.16B, v21.16B, v30.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................ + and v15.16B, v15.16B, v13.16B // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................... 
+ and v24.16B, v20.16B, v13.16B // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................... + trn1 v19.4S, v22.4S, v24.4S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................. 
+ and v27.16B, v27.16B, v30.16B // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................... + and v20.16B, v25.16B, v30.16B // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................ 
+ trn1 v13.4S, v20.4S, v17.4S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................... + mov v12.d[0], v2.d[1] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............... 
+ ldr d29, [sp, #STACK_MASK1] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..... // @slothy:reads=mask1 + mov v2.d[0], v13.d[1] // 
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*... + mov v16.d[0], v6.d[1] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............. 
+ eor w1, w11, w2 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................. + mov v10.d[0], v0.d[1] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................ 
+ trn1 v17.4S, v27.4S, v15.4S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................... + mov v6.d[0], v17.d[1] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*. 
+ mov v14.d[0], v4.d[1] // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............. + mov v18.d[0], v8.d[1] // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............ 
+ mov v8.d[0], v19.d[1] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................* + trn1 v15.4S, v11.4S, v23.4S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................... 
+ mov v4.d[0], v15.d[1] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.. + trn1 v11.4S, v21.4S, v26.4S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................... 
+ mov v0.d[0], v11.d[1] // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.... + + // original source code + // tst w1, #1 // .....*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // sub v25.2S, v28.2S, v1.2S // ..........................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // sub v27.2S, v29.2S, v3.2S // ...............*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // sub v20.2S, v29.2S, v5.2S // ...*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // sub v21.2S, v29.2S, v7.2S // .............*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // sub v26.2S, v29.2S, v9.2S // ............*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add v25.2S, v0.2S, v25.2S // ...........................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add v27.2S, v2.2S, v27.2S // .........................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add v20.2S, v4.2S, v20.2S // ....*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add v21.2S, v6.2S, v21.2S // ........................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add v26.2S, v8.2S, v26.2S // .......................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // sub v22.2S, v28.2S, v11.2S // .................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // sub v23.2S, v29.2S, v13.2S // *............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // sub v28.2S, v29.2S, v15.2S // ..............*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // sub v24.2S, v29.2S, v17.2S // ...........*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // sub v29.2S, v29.2S, v19.2S // ..........*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add v22.2S, v10.2S, v22.2S // .............................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // add v23.2S, v12.2S, v23.2S // .........*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add v28.2S, v14.2S, v28.2S // ...................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add v24.2S, v16.2S, v24.2S // ...............................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add v29.2S, v18.2S, v29.2S // ................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add v0.2S, v0.2S, v1.2S // ....................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add v2.2S, v2.2S, v3.2S // ......*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add v5.2S, v4.2S, v5.2S // ..........................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add v7.2S, v6.2S, v7.2S // .................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // add v4.2S, v8.2S, v9.2S // .....................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // add v10.2S, v10.2S, v11.2S // ..*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add v12.2S, v12.2S, v13.2S // .*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // add v14.2S, v14.2S, v15.2S // ......................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add v17.2S, v16.2S, v17.2S // ................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add v18.2S, v18.2S, v19.2S // ..................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // fcsel_dform v9, v0, v10, eq // ............................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // fcsel_dform v3, v2, v12, eq // .......*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // fcsel_dform v1, v5, v14, eq // ...........................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // fcsel_dform v15, v7, v17, eq // ..................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // fcsel_dform v13, v4, v18, eq // .......................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mov x4, v9.d[0] // ....................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mov x9, v3.d[0] // ........*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mov x12, v1.d[0] // ............................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // mov x21, v15.d[0] // .....................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // mov x11, v13.d[0] // ........................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // trn2 v9.2S, v0.2S, v25.2S // ........................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // trn1 v0.2S, v0.2S, v25.2S // .............................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // trn2 v3.2S, v2.2S, v27.2S // ........................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // trn1 v2.2S, v2.2S, v27.2S // ...................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // trn2 v1.2S, v5.2S, v20.2S // .................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // trn1 v5.2S, v5.2S, v20.2S // ..................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // trn2 v15.2S, v7.2S, v21.2S // ...............................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // trn1 v7.2S, v7.2S, v21.2S // ....................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // trn2 v13.2S, v4.2S, v26.2S // ................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // trn1 v4.2S, v4.2S, v26.2S // .......................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // fcsel_dform v25, v25, v22, eq // ..............................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // fcsel_dform v27, v27, v23, eq // ...............................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // fcsel_dform v20, v20, v28, eq // ...................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // fcsel_dform v21, v21, v24, eq // ................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // fcsel_dform v26, v26, v29, eq // .....................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // trn2 v6.2S, v22.2S, v10.2S // ...............................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // trn1 v10.2S, v22.2S, v10.2S // ................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // trn2 v22.2S, v23.2S, v12.2S // ..................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // trn1 v12.2S, v23.2S, v12.2S // .................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // trn2 v16.2S, v28.2S, v14.2S // ..............................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // trn1 v14.2S, v28.2S, v14.2S // ......................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // trn2 v11.2S, v24.2S, v17.2S // .................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // trn1 v17.2S, v24.2S, v17.2S // ..................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // trn2 v23.2S, v29.2S, v18.2S // .........................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // trn1 v18.2S, v29.2S, v18.2S // .......................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // stp d25, d27, [sp, #STACK_B_0] // ................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // stp d20, d21, [sp, #STACK_B_16] // .................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // str d26, [sp, #STACK_B_32] // ................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // lsr x15, x4, #32 // ......................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // lsr x2, x9, #32 // ...........................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // lsr x13, x12, #32 // .............................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // lsr x1, x21, #32 // ......................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // lsr x19, x11, #32 // .........................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x0, x19, x19 // .........................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x5, x11, x11 // ............................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x26, x1, x1 // ......................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x16, x21, x21 // ........................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x6, x13, x13 // ..............................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x7, x12, x12 // ..........................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x10, x2, x2 // ............................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add x25, x9, x9 // ................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x20, x15, x15 // .......................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x27, w12, w12 // .....................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umull x12, w12, w6 // ...........................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // mul w19, w19, w30 // ...................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mul w1, w1, w30 // ......................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul w13, w13, w30 // .............................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x27, w19, w0, x27 // ....................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w4, w0, x12 // .............................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umull x0, w4, w4 // ...............................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umull x24, w4, w20 // ..............................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umull x14, w4, w25 // ........................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x22, w4, w10 // ...........................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umull x17, w4, w7 // ...............................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umull x23, w4, w6 // ..........................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umull x18, w4, w16 // ..................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x3, w4, w26 // .....................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x4, w4, w5, x27 // .....................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // mul w27, w21, w30 // .......................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x14, w15, w20, x14 // .........................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x22, w15, w25, x22 // ............................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x17, w20, w10, x17 // ................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x23, w15, w7, x23 // ..................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x18, w20, w6, x18 // ...................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x3, w15, w16, x3 // .........................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x4, w20, w26, x4 // .........................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w15, w5, x12 // ..............................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mul w15, w11, w30 // ............................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x17, w9, w9, x17 // .................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x23, w9, w10, x23 // ...................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x18, w9, w7, x18 // ....................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x3, w9, w6, x3 // ..........................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x4, w9, w16, x4 // ..........................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x9, w9, w26, x12 // .......................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w2, w10, x18 // .............................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x18, w2, w7, x3 // ...........................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x4, w10, w6, x4 // ..............................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x9, w2, w16, x9 // ..................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w11, w15, x12 // ..............................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x21, w21, w27, x14 // ..........................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x9, x9, x4, lsr #26 // ...................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x11, w13, w6, x0 // .................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // add x11, x11, x9, lsr #25 // ....................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // bic x2, x9, #0x1ffffff // .....................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // add x11, x11, x2, lsr #24 // ......................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x9, x9, #0x1ffffff // ......................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x11, x11, x2, lsr #21 // .........................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x2, w1, w26, x17 // ..................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x13, x20, x20 // ................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x0, x10, x10 // ........................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x20, x6, x6 // .....................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // add x14, x26, x26 // ...............................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x11, w27, w7, x11 // ..........................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x27, w27, w6, x24 // ...............................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // and x4, x4, #0x3ffffff // ................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x11, w1, w0, x11 // ...........................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x27, w1, w7, x27 // ..................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x21, w1, w20, x21 // ...........................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x1, w1, w16, x22 // .............................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x11, w15, w25, x11 // ............................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x10, w15, w10, x27 // ...................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x21, w15, w7, x21 // .......................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x1, w15, w6, x1 // ..............................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x2, w15, w16, x2 // ...................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x15, w15, w26, x23 // ....................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x11, w19, w13, x11 // .................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x13, w19, w25, x10 // .....................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x21, w19, w0, x21 // ........................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x1, w19, w7, x1 // ...............................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x2, w19, w20, x2 // ....................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x15, w19, w16, x15 // ......................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x12, w19, w14, x12 // ....................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x19, w19, w5, x18 // .............................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // add x13, x13, x11, lsr #26 // ......................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and x0, x11, #0x3ffffff // ............................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x21, x21, x13, lsr #25 // .......................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // bfi x0, x13, #32, #25 // ..........................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x11, x1, x21, lsr #26 // ...........................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // and x21, x21, #0x3ffffff // ....................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x2, x2, x11, lsr #25 // ............................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // bfi x21, x11, #32, #25 // .......................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x11, x15, x2, lsr #26 // .............................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // and x1, x2, #0x3ffffff // ...........................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x12, x12, x11, lsr #25 // ..............................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // bfi x1, x11, #32, #25 // ............................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x11, x19, x12, lsr #26 // ...............................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // and x12, x12, #0x3ffffff // .....................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // add x4, x4, x11, lsr #25 // ........................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // bfi x12, x11, #32, #25 // ......................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x9, x9, x4, lsr #26 // ............................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // and x4, x4, #0x3ffffff // .....................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // bfi x4, x9, #32, #26 // ..............................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // stp x0, x21, [sp, #STACK_A_0] // ....................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // stp x1, x12, [sp, #STACK_A_16] // ...................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // str x4, [sp, #STACK_A_32] // ...............................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // ldr x4, [sp, #STACK_B_0] // .................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // ldr x9, [sp, #STACK_B_8] // .............................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // ldr x12, [sp, #STACK_B_16] // ...............................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // ldr x21, [sp, #STACK_B_24] // .......................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // ldr x11, [sp, #STACK_B_32] // .................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // lsr x15, x4, #32 // ........................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // lsr x2, x9, #32 // ..............................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // lsr x13, x12, #32 // ................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // lsr x1, x21, #32 // ........................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // lsr x19, x11, #32 // ..................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x0, x19, x19 // ...................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x5, x11, x11 // .........................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x26, x1, x1 // .........................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x16, x21, x21 // ...........................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x6, x13, x13 // ....................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x7, x12, x12 // ...................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x10, x2, x2 // ..................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x25, x9, x9 // .........................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x20, x15, x15 // .........................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x27, w12, w12 // ......................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umull x12, w12, w6 // ................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // mul w19, w19, w30 // ..................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mul w1, w1, w30 // ..............................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul w13, w13, w30 // .................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x27, w19, w0, x27 // .......................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w4, w0, x12 // .................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umull x0, w4, w4 // ..................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x24, w4, w20 // ........................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umull x14, w4, w25 // ..........................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x22, w4, w10 // ................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umull x17, w4, w7 // .............................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umull x23, w4, w6 // ...................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umull x18, w4, w16 // .................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umull x3, w4, w26 // ..........................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x4, w4, w5, x27 // ........................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul w27, w21, w30 // ............................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x14, w15, w20, x14 // ...........................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x22, w15, w25, x22 // .................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x17, w20, w10, x17 // ..............................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x23, w15, w7, x23 // ....................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x18, w20, w6, x18 // .....................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x3, w15, w16, x3 // ................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x4, w20, w26, x4 // .........................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w15, w5, x12 // ..................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mul w15, w11, w30 // ................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x17, w9, w9, x17 // ...............................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x23, w9, w10, x23 // .....................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x18, w9, w7, x18 // ......................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x3, w9, w6, x3 // .................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x4, w9, w16, x4 // ..........................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x9, w9, w26, x12 // ...................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w2, w10, x18 // .......................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x18, w2, w7, x3 // ..................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x4, w10, w6, x4 // ...........................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x9, w2, w16, x9 // ....................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w11, w15, x12 // ........................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x21, w21, w27, x14 // .............................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // add x9, x9, x4, lsr #26 // ............................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x11, w13, w6, x0 // .....................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // add x11, x11, x9, lsr #25 // .............................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // bic x2, x9, #0x1ffffff // .....................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // add x11, x11, x2, lsr #24 // ......................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x9, x9, #0x1ffffff // ............................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add x11, x11, x2, lsr #21 // .......................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x2, w1, w26, x17 // ....................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x13, x20, x20 // ..........................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x0, x10, x10 // ..............................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x20, x6, x6 // ...............................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x14, x26, x26 // .............................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x11, w27, w7, x11 // ........................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x27, w27, w6, x24 // .........................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and x4, x4, #0x3ffffff // .....................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x11, w1, w0, x11 // .........................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x27, w1, w7, x27 // ..........................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x21, w1, w20, x21 // ................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x1, w1, w16, x22 // ..................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x11, w15, w25, x11 // ..........................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x10, w15, w10, x27 // ...........................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x21, w15, w7, x21 // .................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x1, w15, w6, x1 // ...................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x2, w15, w16, x2 // .........................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x15, w15, w26, x23 // ......................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x11, w19, w13, x11 // ...........................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x13, w19, w25, x10 // ............................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x21, w19, w0, x21 // ...............................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x1, w19, w7, x1 // ....................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x2, w19, w20, x2 // ..........................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x15, w19, w16, x15 // .......................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x12, w19, w14, x12 // ..............................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x19, w19, w5, x18 // ...................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x13, x13, x11, lsr #26 // ............................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // and x0, x11, #0x3ffffff // ...........................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x21, x21, x13, lsr #25 // .............................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // bfi x0, x13, #32, #25 // ......................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x11, x1, x21, lsr #26 // ..............................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and x21, x21, #0x3ffffff // .....................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // add x2, x2, x11, lsr #25 // ...........................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // bfi x21, x11, #32, #25 // ......................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x15, x15, x2, lsr #26 // ............................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // and x2, x2, #0x3ffffff // .......................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x12, x12, x15, lsr #25 // ...............................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // bfi x2, x15, #32, #25 // ........................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x1, x19, x12, lsr #26 // ....................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and x12, x12, #0x3ffffff // .......................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x4, x4, x1, lsr #25 // ......................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // bfi x12, x1, #32, #25 // ........................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x9, x9, x4, lsr #26 // .............................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // and x4, x4, #0x3ffffff // ..........................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // bfi x4, x9, #32, #26 // ............................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // stp x0, x21, [sp, #STACK_B_0] // .......................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // stp x2, x12, [sp, #STACK_B_16] // ..................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // str x4, [sp, #STACK_B_32] // .................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // ldr x19, [sp, #STACK_A_0] // ....................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // ldr x5, [sp, #STACK_A_8] // ..................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // ldr x26, [sp, #STACK_A_16] // ..........................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // ldr x16, [sp, #STACK_A_24] // ..............................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // ldr x6, [sp, #STACK_A_32] // ................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // ldr x8, #=0x07fffffe07fffffc // ...............................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x5, x5, x8 // ...................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x26, x26, x8 // ...........................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x16, x16, x8 // ...................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x6, x6, x8 // .................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // movk x8, #0xffb4 // ..............................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x19, x19, x8 // .....................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // sub x19, x19, x0 // ..............................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // sub x5, x5, x21 // .......................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // sub x26, x26, x2 // ....................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // sub x16, x16, x12 // .........................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // sub x6, x6, x4 // ......................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and x13, x13, #0x1ffffff // ........................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x11, x11, #0x1ffffff // .........................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and x15, x15, #0x1ffffff // .............................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // and x1, x1, #0x1ffffff // .............................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // mov w0, w0 // ........................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mov w21, w21 // ......................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mov w2, w2 // ...............................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // mov w12, w12 // .........................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mov w4, w4 // ................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // lsr x7, x19, #32 // ................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // lsr x10, x5, #32 // ..................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // lsr x25, x26, #32 // ...................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // lsr x20, x16, #32 // .............................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // lsr x27, x6, #32 // .......................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // ldr x24, #=121666 // .....................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x9, w27, w24, x9 // ........................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x0, w19, w24, x0 // ...............................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x13, w7, w24, x13 // .................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x21, w5, w24, x21 // ............................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // lsr x14, x9, #25 // .........................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x11, w10, w24, x11 // ...................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x9, x9, #0x1ffffff // ..............................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x2, w26, w24, x2 // ...........................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x0, x0, x14 // ..........................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x15, w25, w24, x15 // .....................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // add x0, x0, x14, lsl #1 // ...........................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x12, w16, w24, x12 // ..........................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x0, x0, x14, lsl #4 // ............................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x1, w20, w24, x1 // ....................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x4, w6, w24, x4 // .............................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // add x13, x13, x0, lsr #26 // ...............................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // and x0, x0, #0x3ffffff // ........................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x21, x21, x13, lsr #25 // ................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // and x13, x13, #0x1ffffff // .......................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x11, x11, x21, lsr #26 // .................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // and x21, x21, #0x3ffffff // ............................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add x2, x2, x11, lsr #25 // ..................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x11, x11, #0x1ffffff // ...........................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add x15, x15, x2, lsr #26 // ...................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x2, x2, #0x3ffffff // ..........................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x12, x12, x15, lsr #25 // ....................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x15, x15, #0x1ffffff // ...............................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add x1, x1, x12, lsr #26 // .....................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // and x12, x12, #0x3ffffff // ..............................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x4, x4, x1, lsr #25 // .............................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // and x1, x1, #0x1ffffff // ......................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x9, x9, x4, lsr #26 // ................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // and x4, x4, #0x3ffffff // ..............................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul w24, w13, w30 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // mul w14, w21, w30 // .................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // mul w22, w11, w30 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mul w17, w15, w30 // .....................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // mul w23, w12, w30 // ......................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mul w18, w1, w30 // .......................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul w3, w4, w30 // ................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // mul w29, w9, w30 // .....................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umull x28, w13, w6 // ..........................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w11, w16, x28 // ......................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w15, w26, x28 // ...........................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x28, w1, w5, x28 // ............................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x9, w9, w19, x28 // ...............................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x9, w0, w27, x9 // ................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x9, w21, w20, x9 // .................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x9, w2, w25, x9 // .................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x9, w12, w10, x9 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x9, w4, w7, x9 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x28, w13, w20 // .........................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w11, w25, x28 // ..................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w15, w10, x28 // ...................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w1, w7, x28 // ....................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w29, w27, x28 // ........................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x28, x28, x28 // .........................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w0, w6, x28 // .............................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x28, w21, w16, x28 // .........................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w2, w26, x28 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w12, w5, x28 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x4, w4, w19, x28 // ................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umull x28, w13, w16 // ..................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w11, w26, x28 // ...................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w15, w5, x28 // .....................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x1, w1, w19, x28 // ..........................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x1, w29, w6, x1 // .............................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x1, w0, w20, x1 // ..............................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x1, w21, w25, x1 // .................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x1, w2, w10, x1 // ..................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x1, w12, w7, x1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x1, w3, w27, x1 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umull x28, w13, w25 // ....................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w11, w10, x28 // ......................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w15, w7, x28 // .......................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w18, w27, x28 // ........................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w29, w20, x28 // ...........................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add x28, x28, x28 // ............................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x28, w0, w16, x28 // ...............................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x28, w21, w26, x28 // .................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x28, w2, w5, x28 // ..................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x12, w12, w19, x28 // ...................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x12, w3, w6, x12 // ....................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x28, w29, w16 // ....................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x15, w15, w19, x28 // .....................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x15, w18, w6, x15 // ......................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x15, w11, w5, x15 // .......................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x15, w13, w26, x15 // ........................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x15, w3, w20, x15 // .........................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x15, w23, w27, x15 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x15, w2, w7, x15 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x15, w21, w10, x15 // ............................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x15, w0, w25, x15 // .............................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umull x28, w29, w25 // .......................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w18, w20, x28 // ........................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w17, w27, x28 // .........................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x28, w11, w7, x28 // ..........................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w13, w10, x28 // ...........................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x28, x28, x28 // ............................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x28, w3, w16, x28 // .............................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x28, w23, w6, x28 // ..............................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x28, w2, w19, x28 // ...............................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x28, w21, w5, x28 // ................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x28, w0, w26, x28 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umull x8, w29, w26 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x8, w18, w16, x8 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x8, w17, w6, x8 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x11, w11, w19, x8 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x11, w13, w5, x11 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul w2, w2, w30 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x11, w3, w25, x11 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x11, w23, w20, x11 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x11, w2, w27, x11 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x11, w21, w7, x11 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x11, w0, w10, x11 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x15, x15, x28, lsr #26 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x28, x28, #0x3ffffff // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x12, x12, x15, lsr #25 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x15, x15, #0x1ffffff // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // add x1, x1, x12, lsr #26 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x12, x12, #0x3ffffff // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................... 
+ // add x4, x4, x1, lsr #25 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // bfi x12, x1, #32, #25 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................... 
+ // add x9, x9, x4, lsr #26 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // and x4, x4, #0x3ffffff // ............................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // bic x1, x9, #0x3ffffff // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // lsr x8, x1, #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // bfi x4, x9, #32, #26 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // add x9, x8, x1, lsr #25 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x9, x9, x1, lsr #22 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x9, w29, w7, x9 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x9, w18, w10, x9 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x9, w17, w25, x9 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x9, w22, w20, x9 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x9, w24, w27, x9 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // add x9, x9, x9 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x9, w3, w5, x9 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x9, w23, w26, x9 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x9, w2, w16, x9 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x9, w14, w6, x9 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x9, w0, w19, x9 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x1, w29, w5 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x1, w18, w26, x1 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x1, w17, w16, x1 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x1, w22, w6, x1 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x1, w13, w19, x1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x1, w3, w10, x1 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x1, w23, w25, x1 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................ + // umaddl x24, w2, w20, x1 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x1, w14, w27, x24 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x1, w0, w7, x1 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull x10, w29, w10 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umaddl x10, w18, w25, x10 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x10, w17, w20, x10 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x10, w22, w27, x10 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umaddl x13, w13, w7, x10 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // add x13, x13, x13 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x13, w3, w26, x13 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x13, w23, w16, x13 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umaddl x2, w2, w6, x13 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umaddl x21, w21, w19, x2 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umaddl x21, w0, w5, x21 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................... + // add x2, x1, x9, lsr #26 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and x9, x9, #0x3ffffff // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................... + // add x21, x21, x2, lsr #25 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // bfi x9, x2, #32, #25 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................... + // add x11, x11, x21, lsr #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // and x21, x21, #0x3ffffff // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................... + // add x2, x28, x11, lsr #25 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // bfi x21, x11, #32, #25 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................. + // add x11, x15, x2, lsr #26 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and x15, x2, #0x3ffffff // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................... + // bfi x15, x11, #32, #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................. 
+ // ldr x11, [sp, #STACK_CTR] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................... + // lsr x2, x11, #32 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................... 
+ // subs w0, w11, #1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................... + // asr w11, w0, #5 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................... 
+ // add x13, sp, #STACK_SCALAR // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................... + // ldr w11, [x13, w11, SXTW #2] // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................... 
+ // and w13, w0, #0x1f // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................ + // lsr w11, w11, w13 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................... 
+ // stp w0, w11, [sp, #STACK_CTR] // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................. + // umull v25.2D, v0.2S, v23.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v25.2D, v2.2S, v11.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v25.2D, v5.2S, v16.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v25.2D, v7.2S, v22.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v25.2D, v4.2S, v6.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // mul v27.2S, v23.2S, v31.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umull v20.2D, v9.2S, v11.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v20.2D, v3.2S, v16.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v20.2D, v1.2S, v22.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v20.2D, v15.2S, v6.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umlal v20.2D, v13.2S, v27.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v25.2D, v9.2S, v18.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v25.2D, v3.2S, v17.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v25.2D, v1.2S, v14.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v25.2D, v15.2S, v12.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v25.2D, v13.2S, v10.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................... + // shl v20.2D, v20.2D, #1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull v21.2D, v0.2S, v11.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v21.2D, v2.2S, v16.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v21.2D, v5.2S, v22.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v21.2D, v7.2S, v6.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v21.2D, v4.2S, v27.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // mul v26.2S, v11.2S, v31.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v20.2D, v0.2S, v18.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v20.2D, v2.2S, v17.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v20.2D, v5.2S, v14.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v20.2D, v7.2S, v12.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v20.2D, v4.2S, v10.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................. + // mul v18.2S, v18.2S, v31.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull v11.2D, v9.2S, v16.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v11.2D, v3.2S, v22.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v11.2D, v1.2S, v6.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v11.2D, v15.2S, v27.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v11.2D, v13.2S, v26.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v21.2D, v9.2S, v17.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v21.2D, v3.2S, v14.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v21.2D, v1.2S, v12.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v21.2D, v15.2S, v10.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v21.2D, v13.2S, v18.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................ 
+ // shl v11.2D, v11.2D, #1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umull v23.2D, v0.2S, v16.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v23.2D, v2.2S, v22.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v23.2D, v5.2S, v6.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v23.2D, v7.2S, v27.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v23.2D, v4.2S, v26.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // mul v16.2S, v16.2S, v31.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v11.2D, v0.2S, v17.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v11.2D, v2.2S, v14.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v11.2D, v5.2S, v12.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v11.2D, v7.2S, v10.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v11.2D, v4.2S, v18.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul v17.2S, v17.2S, v31.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................. + // umull v28.2D, v9.2S, v22.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v28.2D, v3.2S, v6.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v28.2D, v1.2S, v27.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v28.2D, v15.2S, v26.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v28.2D, v13.2S, v16.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v23.2D, v9.2S, v14.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v23.2D, v3.2S, v12.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v23.2D, v1.2S, v10.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v23.2D, v15.2S, v18.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v23.2D, v13.2S, v17.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................... + // shl v28.2D, v28.2D, #1 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umull v29.2D, v0.2S, v22.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v29.2D, v2.2S, v6.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v29.2D, v5.2S, v27.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v29.2D, v7.2S, v26.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v29.2D, v4.2S, v16.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................ + // mul v22.2S, v22.2S, v31.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v28.2D, v0.2S, v14.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v28.2D, v2.2S, v12.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v28.2D, v5.2S, v10.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................ + // umlal v28.2D, v7.2S, v18.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v28.2D, v4.2S, v17.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................... + // mul v14.2S, v14.2S, v31.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umull v8.2D, v9.2S, v6.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v8.2D, v3.2S, v27.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v8.2D, v1.2S, v26.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v8.2D, v15.2S, v16.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v8.2D, v13.2S, v22.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v29.2D, v9.2S, v12.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v29.2D, v3.2S, v10.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v29.2D, v1.2S, v18.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v29.2D, v15.2S, v17.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v29.2D, v13.2S, v14.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // shl v8.2D, v8.2D, #1 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umull v19.2D, v0.2S, v6.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v19.2D, v2.2S, v27.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v19.2D, v5.2S, v26.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v19.2D, v7.2S, v16.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // umlal v19.2D, v4.2S, v22.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul v6.2S, v6.2S, v31.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................... + // umlal v8.2D, v0.2S, v12.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v8.2D, v2.2S, v10.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v8.2D, v5.2S, v18.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v8.2D, v7.2S, v17.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................ + // umlal v8.2D, v4.2S, v14.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // mul v12.2S, v12.2S, v31.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................... + // umull v27.2D, v9.2S, v27.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v27.2D, v3.2S, v26.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // umlal v27.2D, v1.2S, v16.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // ushr v26.2D, v30.2D, #1 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................. + // usra v29.2D, v8.2D, #26 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and v16.16B, v8.16B, v30.16B // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................... + // umlal v19.2D, v9.2S, v10.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // usra v28.2D, v29.2D, #25 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................ + // and v29.16B, v29.16B, v26.16B // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v27.2D, v15.2S, v22.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // usra v23.2D, v28.2D, #26 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................... 
+ // and v22.16B, v28.16B, v30.16B // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................. + // umlal v19.2D, v3.2S, v18.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................... 
+ // usra v11.2D, v23.2D, #25 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................. + // and v23.16B, v23.16B, v26.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v27.2D, v13.2S, v6.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................. + // usra v21.2D, v11.2D, #26 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................... 
+ // and v6.16B, v11.16B, v30.16B // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................. + // umlal v19.2D, v1.2S, v17.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................... 
+ // umlal v19.2D, v15.2S, v14.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................... + // umlal v19.2D, v13.2S, v12.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................... 
+ // usra v20.2D, v21.2D, #25 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................ + // and v21.16B, v21.16B, v26.16B // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................ 
+ // shl v27.2D, v27.2D, #1 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................. + // usra v25.2D, v20.2D, #26 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................... 
+ // and v20.16B, v20.16B, v30.16B // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................ + // umlal v27.2D, v0.2S, v10.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................ 
+ // umlal v27.2D, v2.2S, v18.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................... + // umlal v27.2D, v5.2S, v17.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................. 
+ // umlal v27.2D, v7.2S, v14.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................ + // umlal v27.2D, v4.2S, v12.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................... 
+ // bic v18.16B, v25.16B, v26.16B // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................. + // and v25.16B, v25.16B, v26.16B // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................... 
+ // usra v27.2D, v18.2D, #25 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................... + // usra v27.2D, v18.2D, #24 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................... 
+ // usra v27.2D, v18.2D, #21 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................ + // usra v19.2D, v27.2D, #26 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................. 
+ // and v18.16B, v27.16B, v30.16B // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................... + // usra v16.2D, v19.2D, #25 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................ 
+ // and v12.16B, v19.16B, v26.16B // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................. + // usra v29.2D, v16.2D, #26 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................. 
+ // and v0.16B, v16.16B, v30.16B // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................. + // uzp1 v5.4S, v22.4S, v23.4S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................... 
+ // uzp1 v7.4S, v6.4S, v21.4S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................... + // ld1r {v4.2D}, [sp] // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................... 
+ // uzp1 v17.4S, v5.4S, v7.4S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................... + // uzp2 v5.4S, v5.4S, v7.4S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................ 
+ // trn1 v7.4S, v20.4S, v25.4S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................... + // ldr b27, [sp, #STACK_MASK2] // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................. 
+ // uzp1 v18.4S, v18.4S, v12.4S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................ + // uzp1 v12.4S, v0.4S, v29.4S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................ 
+ // mov v25.d[0], v7.d[1] // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................... + // uzp1 v0.4S, v18.4S, v12.4S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................... 
+ // uzp2 v18.4S, v18.4S, v12.4S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................ + // add v12.4S, v17.4S, v4.4S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................... 
+ // add v14.2S, v7.2S, v4.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................... + // mov v4.b[0], v27.b[0] // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................... 
+ // add v27.4S, v0.4S, v18.4S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................... + // add v17.4S, v17.4S, v5.4S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................... 
+ // add v7.2S, v7.2S, v25.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................. + // add v0.4S, v0.4S, v4.4S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................... 
+ // sub v12.4S, v12.4S, v5.4S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................... + // sub v18.4S, v0.4S, v18.4S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................... 
+ // sub v25.2S, v14.2S, v25.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................... + // zip1 v0.4S, v18.4S, v27.4S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................... 
+ // zip2 v18.4S, v18.4S, v27.4S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................. + // zip1 v5.4S, v12.4S, v17.4S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................. 
+ // zip2 v12.4S, v12.4S, v17.4S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................ + // zip1 v4.2S, v25.2S, v7.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................ 
+ // zip2 v25.2S, v25.2S, v7.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................... + // mov v9.d[0], v0.d[1] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................. 
+ // mov v3.d[0], v18.d[1] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................... + // mov v1.d[0], v5.d[1] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................... 
+ // mov v15.d[0], v12.d[1] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................. + // shl v7.2S, v25.2S, #1 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................. 
+ // shl v17.2S, v4.2S, #1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................... + // shl v27.2S, v15.2S, #1 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................ 
+ // shl v14.2S, v12.2S, #1 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................... + // shl v10.2S, v1.2S, #1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................. 
+ // shl v20.2S, v5.2S, #1 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................... + // shl v21.2S, v3.2S, #1 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................... 
+ // shl v2.2S, v18.2S, #1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................... + // shl v22.2S, v9.2S, #1 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................... 
+ // umull v13.2D, v0.2S, v7.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................... + // umlal v13.2D, v9.2S, v17.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................ 
+ // umlal v13.2D, v18.2S, v27.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................... + // umlal v13.2D, v3.2S, v14.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................... 
+ // umlal v13.2D, v5.2S, v10.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................... + // umull v6.2D, v0.2S, v17.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................... 
+ // umlal v6.2D, v22.2S, v27.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................ + // umlal v6.2D, v18.2S, v14.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................... 
+ // umlal v6.2D, v21.2S, v10.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................... + // umlal v6.2D, v5.2S, v5.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................. 
+ // mul v25.2S, v25.2S, v31.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................ + // umull v16.2D, v0.2S, v27.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................. 
+ // umlal v16.2D, v9.2S, v14.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................ + // umlal v16.2D, v18.2S, v10.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................... 
+ // umlal v16.2D, v3.2S, v20.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................... + // umlal v6.2D, v25.2S, v7.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................. 
+ // umull v7.2D, v0.2S, v14.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................... + // umlal v7.2D, v22.2S, v10.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................... 
+ // umlal v7.2D, v18.2S, v20.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................. + // umlal v7.2D, v21.2S, v3.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................ 
+ // umull v5.2D, v0.2S, v10.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................... + // umlal v5.2D, v9.2S, v20.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................... 
+ // umlal v5.2D, v18.2S, v21.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................. + // umull v3.2D, v0.2S, v20.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................. 
+ // umlal v3.2D, v22.2S, v21.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................ + // umlal v3.2D, v18.2S, v18.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................... 
+ // umull v18.2D, v0.2S, v21.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................... + // umlal v18.2D, v9.2S, v2.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................... 
+ // umull v11.2D, v0.2S, v2.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................... + // umlal v11.2D, v22.2S, v9.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................... 
+ // umull v23.2D, v0.2S, v22.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................ + // umull v0.2D, v0.2S, v0.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................ 
+ // usra v13.2D, v6.2D, #26 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................... + // and v28.16B, v6.16B, v30.16B // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................... 
+ // mul v6.2S, v4.2S, v31.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................. + // bic v29.16B, v13.16B, v26.16B // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................... 
+ // and v9.16B, v13.16B, v26.16B // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................. + // usra v0.2D, v29.2D, #25 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................. 
+ // mul v15.2S, v15.2S, v31.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................ + // usra v0.2D, v29.2D, #24 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................. 
+ // mul v13.2S, v12.2S, v31.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................... + // usra v0.2D, v29.2D, #21 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................ 
+ // mul v1.2S, v1.2S, v31.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................... + // shl v22.2S, v22.2S, #1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................... 
+ // shl v29.2S, v21.2S, #1 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................... + // shl v8.2S, v10.2S, #1 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................. 
+ // shl v19.2S, v27.2S, #1 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................ + // umlal v0.2D, v1.2S, v10.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................... 
+ // umlal v0.2D, v25.2S, v22.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................. + // umlal v0.2D, v6.2S, v2.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................... 
+ // umlal v0.2D, v15.2S, v29.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................ + // umlal v0.2D, v13.2S, v20.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................... 
+ // umlal v23.2D, v25.2S, v2.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................... + // umlal v23.2D, v6.2S, v21.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................. 
+ // umlal v23.2D, v15.2S, v20.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................... + // umlal v23.2D, v13.2S, v10.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................... 
+ // umlal v11.2D, v13.2S, v12.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................... + // umlal v11.2D, v25.2S, v29.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................. 
+ // umlal v11.2D, v6.2S, v20.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................. + // umlal v11.2D, v15.2S, v8.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................ 
+ // usra v23.2D, v0.2D, #26 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................ + // umlal v18.2D, v25.2S, v20.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................. 
+ // umlal v18.2D, v6.2S, v10.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................. + // umlal v18.2D, v15.2S, v14.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................ 
+ // usra v11.2D, v23.2D, #25 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................... + // umlal v3.2D, v15.2S, v27.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................... 
+ // umlal v3.2D, v25.2S, v8.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................... + // umlal v3.2D, v6.2S, v14.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................ 
+ // usra v18.2D, v11.2D, #26 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................... + // umlal v5.2D, v25.2S, v14.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................... 
+ // umlal v5.2D, v6.2S, v27.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................... + // usra v3.2D, v18.2D, #25 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................... 
+ // umlal v7.2D, v6.2S, v4.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................. + // umlal v7.2D, v25.2S, v19.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................... 
+ // usra v5.2D, v3.2D, #26 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................... + // umlal v16.2D, v25.2S, v17.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................... 
+ // usra v7.2D, v5.2D, #25 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................... + // usra v16.2D, v7.2D, #26 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................... 
+ // usra v28.2D, v16.2D, #25 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................... + // usra v9.2D, v28.2D, #26 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................... 
+ // and v4.16B, v3.16B, v30.16B // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................... + // and v5.16B, v5.16B, v26.16B // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................. 
+ // and v0.16B, v0.16B, v30.16B // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................ + // and v6.16B, v7.16B, v30.16B // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................... 
+ // and v1.16B, v23.16B, v26.16B // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................ + // and v7.16B, v16.16B, v26.16B // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................. 
+ // and v2.16B, v11.16B, v30.16B // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................... + // and v8.16B, v28.16B, v30.16B // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................. 
+ // and v3.16B, v18.16B, v26.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................ + // add x1, sp, #STACK_A_0 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................... 
+ // ld2 { v0.S, v1.S }[1], [x1], #8 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................. + // ld2 { v2.S, v3.S }[1], [x1], #8 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................. 
+ // ld2 { v4.S, v5.S }[1], [x1], #8 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................... + // ld2 { v6.S, v7.S }[1], [x1], #8 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................. 
+ // ld2 { v8.S, v9.S }[1], [x1], #8 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................. + // add x1, sp, #STACK_B_0 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................. 
+ // ld2 { v10.S, v11.S }[1], [x1], #8 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................. + // ld2 { v25.S, v26.S }[1], [x1], #8 // 
..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................... + // ld2 { v12.S, v13.S }[1], [x1], #8 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................... 
+ // ld2 { v17.S, v18.S }[1], [x1], #8 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................... + // ld2 { v27.S, v28.S }[1], [x1], #8 // 
...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................... + // add x1, sp, #STACK_X_0 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................... 
+ // ld2 { v10.S, v11.S }[0], [x1], #8 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................ + // ld2 { v25.S, v26.S }[0], [x1], #8 // 
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................ + // ld2 { v12.S, v13.S }[0], [x1], #8 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................... 
+ // ld2 { v17.S, v18.S }[0], [x1], #8 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................... + // ld2 { v27.S, v28.S }[0], [x1], #8 // 
......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................... + // umull v20.2D, v0.2S, v28.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................... 
+ // umlal v20.2D, v2.2S, v18.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................. + // umlal v20.2D, v4.2S, v13.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................ 
+ // umlal v20.2D, v6.2S, v26.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................... + // umlal v20.2D, v8.2S, v11.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................ 
+ // mul v21.2S, v28.2S, v31.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................... + // umull v14.2D, v1.2S, v18.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................... 
+ // umlal v14.2D, v3.2S, v13.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................... + // umlal v14.2D, v5.2S, v26.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................. 
+ // umlal v14.2D, v7.2S, v11.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................... + // umlal v14.2D, v9.2S, v21.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................... 
+ // umlal v20.2D, v1.2S, v27.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................... + // umlal v20.2D, v3.2S, v17.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................. 
+ // umlal v20.2D, v5.2S, v12.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................ + // umlal v20.2D, v7.2S, v25.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................... 
+ // umlal v20.2D, v9.2S, v10.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................... + // shl v22.2D, v14.2D, #1 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................... 
+ // umull v15.2D, v0.2S, v18.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................ + // umlal v15.2D, v2.2S, v13.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................... 
+ // umlal v15.2D, v4.2S, v26.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................. + // umlal v15.2D, v6.2S, v11.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................. 
+ // umlal v15.2D, v8.2S, v21.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................ + // mul v23.2S, v18.2S, v31.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................ 
+ // umlal v22.2D, v0.2S, v27.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................. + // umlal v22.2D, v2.2S, v17.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................ 
+ // umlal v22.2D, v4.2S, v12.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................... + // umlal v22.2D, v6.2S, v25.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................... 
+ // umlal v22.2D, v8.2S, v10.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................... + // mul v18.2S, v27.2S, v31.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................... 
+ // umull v27.2D, v1.2S, v13.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................... + // umlal v27.2D, v3.2S, v26.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................... 
+ // umlal v27.2D, v5.2S, v11.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................. + // umlal v27.2D, v7.2S, v21.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................... 
+ // umlal v27.2D, v9.2S, v23.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................ + // umlal v15.2D, v1.2S, v17.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................. 
+ // umlal v15.2D, v3.2S, v12.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................... + // umlal v15.2D, v5.2S, v25.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................... 
+ // umlal v15.2D, v7.2S, v10.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................... + // umlal v15.2D, v9.2S, v18.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................... 
+ // shl v27.2D, v27.2D, #1 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................... + // umull v28.2D, v0.2S, v13.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................... 
+ // umlal v28.2D, v2.2S, v26.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................... + // umlal v28.2D, v4.2S, v11.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................... 
+ // umlal v28.2D, v6.2S, v21.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................... + // umlal v28.2D, v8.2S, v23.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................ 
+ // mul v13.2S, v13.2S, v31.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................. + // umlal v27.2D, v0.2S, v17.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................... 
+ // umlal v27.2D, v2.2S, v12.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................ + // umlal v27.2D, v4.2S, v25.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................... 
+ // umlal v27.2D, v6.2S, v10.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................. + // umlal v27.2D, v8.2S, v18.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................ 
+ // mul v16.2S, v17.2S, v31.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................... + // umull v17.2D, v1.2S, v26.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................ 
+ // umlal v17.2D, v3.2S, v11.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................... + // umlal v17.2D, v5.2S, v21.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................... 
+ // umlal v17.2D, v7.2S, v23.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................. + // umlal v17.2D, v9.2S, v13.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................... 
+ // umlal v28.2D, v1.2S, v12.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................... + // umlal v28.2D, v3.2S, v25.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................... 
+ // umlal v28.2D, v5.2S, v10.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................. + // umlal v28.2D, v7.2S, v18.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................ 
+ // umlal v28.2D, v9.2S, v16.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................... + // shl v17.2D, v17.2D, #1 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................... 
+ // umull v29.2D, v0.2S, v26.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................... + // umlal v29.2D, v2.2S, v11.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................. 
+ // umlal v29.2D, v4.2S, v21.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................. + // umlal v29.2D, v6.2S, v23.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................ 
+ // umlal v29.2D, v8.2S, v13.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................... + // mul v26.2S, v26.2S, v31.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................... 
+ // umlal v17.2D, v0.2S, v12.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................... + // umlal v17.2D, v2.2S, v25.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................... 
+ // umlal v17.2D, v4.2S, v10.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................. + // umlal v17.2D, v6.2S, v18.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................. 
+ // umlal v17.2D, v8.2S, v16.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................ + // mul v14.2S, v12.2S, v31.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................... 
+ // umull v12.2D, v1.2S, v11.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................. + // umlal v12.2D, v3.2S, v21.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................. 
+ // umlal v12.2D, v5.2S, v23.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................. + // umlal v12.2D, v7.2S, v13.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................ 
+ // umlal v12.2D, v9.2S, v26.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................... + // umlal v29.2D, v1.2S, v25.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................... 
+ // umlal v29.2D, v3.2S, v10.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................. + // umlal v29.2D, v5.2S, v18.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................... 
+ // umlal v29.2D, v7.2S, v16.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................ + // umlal v29.2D, v9.2S, v14.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................... 
+ // shl v19.2D, v12.2D, #1 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................... + // umull v24.2D, v0.2S, v11.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................... 
+ // umlal v24.2D, v2.2S, v21.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................... + // umlal v24.2D, v4.2S, v23.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................. 
+ // umlal v24.2D, v6.2S, v13.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................ + // umlal v24.2D, v8.2S, v26.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................ 
+ // mul v11.2S, v11.2S, v31.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................... + // umlal v19.2D, v0.2S, v25.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................... 
+ // umlal v19.2D, v2.2S, v10.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................. + // umlal v19.2D, v4.2S, v18.2S // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................ 
+ // umlal v19.2D, v6.2S, v16.2S // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................... + // umlal v19.2D, v8.2S, v14.2S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................... 
+ // mul v12.2S, v25.2S, v31.2S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................... + // umull v25.2D, v1.2S, v21.2S // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................... 
+ // umlal v25.2D, v3.2S, v23.2S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................... + // umlal v25.2D, v5.2S, v13.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................. 
+ // ushr v21.2D, v30.2D, #1 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................ + // usra v29.2D, v19.2D, #26 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................... 
+ // and v13.16B, v19.16B, v30.16B // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................... + // umlal v24.2D, v1.2S, v10.2S // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................... 
+ // usra v17.2D, v29.2D, #25 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................ + // and v23.16B, v29.16B, v21.16B // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................ 
+ // umlal v25.2D, v7.2S, v26.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................. + // usra v28.2D, v17.2D, #26 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................... 
+ // and v17.16B, v17.16B, v30.16B // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................... + // umlal v24.2D, v3.2S, v18.2S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................... 
+ // usra v27.2D, v28.2D, #25 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................... + // and v26.16B, v28.16B, v21.16B // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................... 
+ // umlal v25.2D, v9.2S, v11.2S // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................ + // usra v15.2D, v27.2D, #26 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................... 
+ // and v27.16B, v27.16B, v30.16B // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................. + // umlal v24.2D, v5.2S, v16.2S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................ 
+ // umlal v24.2D, v7.2S, v14.2S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................. + // umlal v24.2D, v9.2S, v12.2S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................. 
+ // usra v22.2D, v15.2D, #25 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................. + // and v28.16B, v15.16B, v21.16B // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................... 
+ // shl v25.2D, v25.2D, #1 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................... + // usra v20.2D, v22.2D, #26 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................. 
+ // and v22.16B, v22.16B, v30.16B // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................... + // umlal v25.2D, v0.2S, v10.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................. 
+ // umlal v25.2D, v2.2S, v18.2S // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................. + // umlal v25.2D, v4.2S, v16.2S // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................... 
+ // umlal v25.2D, v6.2S, v14.2S // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................... + // umlal v25.2D, v8.2S, v12.2S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................. 
+ // bic v15.16B, v20.16B, v21.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................ + // and v20.16B, v20.16B, v21.16B // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................... 
+ // usra v25.2D, v15.2D, #25 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................... + // usra v25.2D, v15.2D, #24 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................. 
+ // usra v25.2D, v15.2D, #21 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................. + // usra v24.2D, v25.2D, #26 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................ 
+ // and v25.16B, v25.16B, v30.16B // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................... + // usra v13.2D, v24.2D, #25 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................... 
+ // and v21.16B, v24.16B, v21.16B // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................... + // usra v23.2D, v13.2D, #26 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................... 
+ // and v24.16B, v13.16B, v30.16B // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................ + // trn1 v0.4S, v0.4S, v1.4S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................... 
+ // trn1 v2.4S, v2.4S, v3.4S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................... + // trn1 v4.4S, v4.4S, v5.4S // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................... 
+ // trn1 v6.4S, v6.4S, v7.4S // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................. + // trn1 v8.4S, v8.4S, v9.4S // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................... 
+ // trn1 v11.4S, v25.4S, v21.4S // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*. + // trn1 v13.4S, v24.4S, v23.4S // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............... 
+ // trn1 v15.4S, v17.4S, v26.4S // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*... + // trn1 v17.4S, v27.4S, v28.4S // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........ 
+ // trn1 v19.4S, v22.4S, v20.4S // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................. + // eor w1, w11, w2 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......... 
+ // mov v10.d[0], v0.d[1] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......... + // mov v12.d[0], v2.d[1] // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............. 
+ // mov v14.d[0], v4.d[1] // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...... + // mov v16.d[0], v6.d[1] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........... 
+ // mov v18.d[0], v8.d[1] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..... + // mov v1.d[0], x9 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................... 
+ // mov v3.d[0], x21 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................. + // mov v5.d[0], x15 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................... 
+ // mov v7.d[0], x12 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................. + // mov v9.d[0], x4 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................ 
+ // ldr d28, [sp, #STACK_MASK2] // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................... + // ldr d29, [sp, #STACK_MASK1] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............. 
+ // mov v0.d[0], v11.d[1] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................* + // mov v2.d[0], v13.d[1] // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............ 
+ // mov v4.d[0], v15.d[1] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.. + // mov v6.d[0], v17.d[1] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....... 
+ // mov v8.d[0], v19.d[1] // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.... + + end_label: + + + subs w11, w0, #-1 + cbnz w11, mainloop + + + mov w0, v1.s[0] + mov w1, v1.s[1] + mov w2, v3.s[0] + mov w3, v3.s[1] + mov w4, v5.s[0] + mov w5, v5.s[1] + mov w6, v7.s[0] + mov w7, v7.s[1] + mov w8, v9.s[0] + mov w9, v9.s[1] + + stp w0, w1, [sp, #80] + stp w2, w3, [sp, #88] + stp w4, w5, [sp, #96] + stp w6, w7, [sp, #104] + stp w8, w9, [sp, #112] + + mov x10, v0.d[0] + mov x11, v2.d[0] + mov x12, v4.d[0] + mov x13, v6.d[0] + mov x14, v8.d[0] + + stp x10, x11, [sp] + stp x12, x13, [sp, #16] + str x14, [sp, #32] + + adr x10, invtable + str x10, [sp, #160] + +.Linvloopnext: + ldrh w11, [x10], #2 + mov v20.s[0], w11 + str x10, [sp, #160] + + and w12, w11, #0x7f + subs w30, w12, #1 // square times + bmi .Lskipsquare + + mov w23, w3 + mov w24, w4 + mov w25, w5 + mov w26, w6 + mov w27, w7 + mov w14, w8 + add w10, w0, w0 + add w11, w1, w1 + add w12, w2, w2 + +.Lsqrloop1: + umull x20, w0, w0 + add x4, x24, x23, lsr #25 + umull x21, w10, w1 + and x3, x23, #0x1ffffff + umull x22, w10, w2 
+ add w13, w3, w3 + umull x23, w10, w3 + add x5, x25, x4, lsr #26 + umull x24, w11, w13 + and x4, x4, #0x3ffffff + umull x28, w4, w4 + add x6, x26, x5, lsr #25 + umull x25, w12, w3 + and x5, x5, #0x1ffffff + umull x26, w13, w3 + add w15, w5, w5 + umaddl x28, w13, w15, x28 + add x7, x27, x6, lsr #26 + umull x19, w4, w15 + and x6, x6, #0x3ffffff + umull x27, w11, w6 + add x8, x14, x7, lsr #25 + umaddl x28, w12, w6, x28 + and x7, x7, #0x1ffffff + umaddl x19, w13, w6, x19 + add x9, x9, x8, lsr #26 + umaddl x27, w10, w7, x27 + add w17, w7, w7 + umaddl x28, w11, w17, x28 + and x8, x8, #0x3ffffff + umaddl x19, w10, w9, x19 + add w14, w9, w9 + umaddl x27, w12, w5, x27 + add w16, w14, w14, lsl #1 + umaddl x28, w10, w8, x28 + add w3, w15, w15, lsl #1 + umaddl x19, w12, w7, x19 + add w16, w16, w14, lsl #4 + umaddl x27, w13, w4, x27 + add w3, w3, w15, lsl #4 + umaddl x28, w16, w9, x28 + + umaddl x19, w11, w8, x19 + add w9, w6, w6, lsl #1 + umaddl x20, w3, w5, x20 + + umaddl x24, w10, w4, x24 + add w9, w9, w6, lsl #4 + umaddl x25, w10, w5, x25 + add x19, x19, x28, lsr #26 + umaddl x26, w10, w6, x26 + and x14, x28, #0x3ffffff + umaddl x22, w11, w1, x22 + add x20, x20, x19, lsr #25 + umaddl x23, w11, w2, x23 + bic x1, x19, #0x1ffffff + umaddl x26, w12, w4, x26 + add x20, x20, x1, lsr #24 + umaddl x24, w2, w2, x24 + add w0, w4, w4 + umaddl x25, w11, w4, x25 + add x20, x20, x1, lsr #21 + umaddl x26, w11, w15, x26 + add w1, w17, w17, lsl #1 + umaddl x20, w9, w0, x20 + + umaddl x21, w9, w15, x21 + add w1, w1, w17, lsl #4 + umaddl x22, w9, w6, x22 + add w10, w8, w8, lsl #1 + umaddl x20, w1, w13, x20 + and x9, x19, #0x1ffffff + umaddl x21, w1, w4, x21 + add w10, w10, w8, lsl #4 + umaddl x22, w1, w15, x22 + subs w30, w30, #1 + umaddl x20, w10, w12, x20 + + umaddl x21, w10, w13, x21 + + umaddl x22, w10, w0, x22 + + umaddl x20, w16, w11, x20 + + umaddl x21, w16, w2, x21 + + umaddl x22, w16, w13, x22 + add w11, w6, w6 + umaddl x23, w1, w6, x23 + + umaddl x24, w1, w7, x24 + add x21, x21, 
x20, lsr #26 + umaddl x26, w10, w8, x26 + and x0, x20, #0x3ffffff + umaddl x23, w10, w15, x23 + add x22, x22, x21, lsr #25 + umaddl x24, w10, w11, x24 + and x1, x21, #0x1ffffff + umaddl x25, w10, w17, x25 + and x2, x22, #0x3ffffff + umaddl x23, w16, w4, x23 + add w10, w0, w0 + umaddl x24, w16, w15, x24 + add w11, w1, w1 + umaddl x25, w16, w6, x25 + add w12, w2, w2 + umaddl x26, w16, w17, x26 + add x23, x23, x22, lsr #26 + umaddl x27, w16, w8, x27 + bpl .Lsqrloop1 + + mov w11, v20.s[0] + add x4, x24, x23, lsr #25 + and x3, x23, #0x1ffffff + add x5, x25, x4, lsr #26 + and x4, x4, #0x3ffffff + add x6, x26, x5, lsr #25 + and x5, x5, #0x1ffffff + add x7, x27, x6, lsr #26 + and x6, x6, #0x3ffffff + add x8, x14, x7, lsr #25 + and x7, x7, #0x1ffffff + add x9, x9, x8, lsr #26 + and x8, x8, #0x3ffffff +.Lskipsquare: + mov w12, #40 + tst w11, #1<<8 + ubfx w13, w11, #9, #2 + bne .Lskipmul + mul w20, w13, w12 + add x20, sp, x20 + + ldp w10, w11, [x20] + ldp w12, w13, [x20, #8] + ldp w14, w15, [x20, #16] + ldp w16, w17, [x20, #24] + ldp w19, w20, [x20, #32] + mov w30, #19 + + umull x21, w1, w19 + umull x22, w1, w17 + umull x23, w1, w16 + umull x24, w1, w15 + umaddl x21, w3, w16, x21 + umaddl x22, w3, w15, x22 + umaddl x23, w3, w14, x23 + umaddl x24, w3, w13, x24 + umaddl x21, w5, w14, x21 + umaddl x22, w5, w13, x22 + umaddl x23, w5, w12, x23 + umaddl x24, w5, w11, x24 + umaddl x21, w7, w12, x21 + umaddl x22, w7, w11, x22 + umaddl x23, w7, w10, x23 + mul w27, w7, w30 + mul w25, w9, w30 + mul w26, w8, w30 + mul w28, w6, w30 + umaddl x24, w27, w20, x24 + umaddl x21, w9, w10, x21 + umaddl x22, w25, w20, x22 + umaddl x23, w25, w19, x23 + umaddl x24, w25, w17, x24 + add x22, x22, x22 + umaddl x21, w0, w20, x21 + add x24, x24, x24 + umaddl x22, w0, w19, x22 + umaddl x23, w0, w17, x23 + umaddl x24, w0, w16, x24 + umaddl x21, w2, w17, x21 + umaddl x22, w2, w16, x22 + umaddl x23, w2, w15, x23 + umaddl x24, w2, w14, x24 + umaddl x21, w4, w15, x21 + umaddl x22, w4, w14, x22 + umaddl x23, 
w4, w13, x23 + umaddl x24, w4, w12, x24 + umaddl x21, w6, w13, x21 + umaddl x22, w6, w12, x22 + umaddl x23, w6, w11, x23 + umaddl x24, w6, w10, x24 + umaddl x21, w8, w11, x21 + umaddl x22, w8, w10, x22 + umaddl x23, w26, w20, x23 + umaddl x24, w26, w19, x24 + umull x6, w25, w16 + umull x7, w25, w15 + umull x8, w25, w14 + umaddl x6, w5, w10, x6 + mul w5, w5, w30 + umaddl x7, w27, w17, x7 + umaddl x8, w27, w16, x8 + umaddl x6, w27, w19, x6 + umaddl x7, w5, w20, x7 + umaddl x8, w5, w19, x8 + umaddl x6, w3, w12, x6 + umaddl x7, w3, w11, x7 + umaddl x8, w3, w10, x8 + umaddl x6, w1, w14, x6 + umaddl x7, w1, w13, x7 + umaddl x8, w1, w12, x8 + mul w9, w4, w30 + add x7, x7, x7 + umaddl x6, w26, w17, x6 + umaddl x7, w26, w16, x7 + umaddl x8, w26, w15, x8 + umaddl x6, w28, w20, x6 + umaddl x7, w28, w19, x7 + umaddl x8, w28, w17, x8 + umaddl x6, w4, w11, x6 + umaddl x7, w4, w10, x7 + umaddl x8, w9, w20, x8 + umaddl x6, w2, w13, x6 + umaddl x7, w2, w12, x7 + umaddl x8, w2, w11, x8 + umaddl x6, w0, w15, x6 + umaddl x7, w0, w14, x7 + umaddl x8, w0, w13, x8 + mul w4, w3, w30 + add x6, x6, x7, lsr #26 + and x7, x7, #0x3ffffff + add x24, x24, x6, lsr #25 + and x6, x6, #0x1ffffff + add x23, x23, x24, lsr #26 + and x24, x24, #0x3ffffff + add x22, x22, x23, lsr #25 + bfi x24, x23, #32, #25 + add x21, x21, x22, lsr #26 + and x22, x22, #0x3ffffff + bic x3, x21, #0x3ffffff + lsr x23, x3, #26 + bfi x22, x21, #32, #26 + add x23, x23, x3, lsr #25 + umull x21, w25, w13 + add x23, x23, x3, lsr #22 + umull x3, w25, w12 + umaddl x23, w25, w11, x23 + umaddl x21, w27, w15, x21 + umaddl x3, w27, w14, x3 + umaddl x23, w27, w13, x23 + mul w27, w1, w30 + umaddl x3, w5, w16, x3 + umaddl x23, w5, w15, x23 + umaddl x21, w5, w17, x21 + umaddl x3, w4, w19, x3 + umaddl x23, w4, w17, x23 + umaddl x21, w4, w20, x21 + umaddl x3, w1, w10, x3 + umaddl x23, w27, w20, x23 + umaddl x21, w1, w11, x21 + mul w25, w2, w30 + add x23, x23, x23 + add x21, x21, x21 + umaddl x23, w26, w12, x23 + umaddl x3, w26, w13, x3 + 
umaddl x21, w26, w14, x21 + umaddl x23, w28, w14, x23 + umaddl x3, w28, w15, x3 + umaddl x21, w28, w16, x21 + umaddl x23, w9, w16, x23 + umaddl x3, w9, w17, x3 + umaddl x21, w9, w19, x21 + umaddl x23, w25, w19, x23 + umaddl x3, w25, w20, x3 + umaddl x21, w2, w10, x21 + umaddl x23, w0, w10, x23 + umaddl x3, w0, w11, x3 + umaddl x21, w0, w12, x21 + add x1, x3, x23, lsr #26 + and x0, x23, #0x3ffffff + add x2, x21, x1, lsr #25 + and x1, x1, #0x1ffffff + add x3, x8, x2, lsr #26 + and x2, x2, #0x3ffffff + add x4, x7, x3, lsr #25 + and x3, x3, #0x1ffffff + add x5, x6, x4, lsr #26 + and x4, x4, #0x3ffffff + and x5, x5, #0x3ffffff + + mov w11, v20.s[0] + mov w6, w24 + lsr x7, x24, #32 + mov w8, w22 + lsr x9, x22, #32 +.Lskipmul: + ubfx w12, w11, #11, #2 + cbz w12, .Lskipstore + mov w13, #40 + mul w12, w12, w13 + add x12, sp, x12 + + stp w0, w1, [x12] + stp w2, w3, [x12, #8] + stp w4, w5, [x12, #16] + stp w6, w7, [x12, #24] + stp w8, w9, [x12, #32] +.Lskipstore: + + ldr x10, [sp, #160] + adr x11, invtable+13*2 + cmp x10, x11 + bne .Linvloopnext + + // Final reduce + // w5 and w9 are 26 bits instead of 25 + + orr x10, x0, x1, lsl #26 + orr x10, x10, x2, lsl #51 + + lsr x11, x2, #13 + orr x11, x11, x3, lsl #13 + orr x11, x11, x4, lsl #38 + + add x12, x5, x6, lsl #25 + adds x12, x12, x7, lsl #51 + + lsr x13, x7, #13 + orr x13, x13, x8, lsl #12 + orr x13, x13, x9, lsl #38 + + adcs x13, x13, xzr + adc x14, xzr, xzr + + extr x17, x14, x13, #63 + mov w19, #19 + mul w15, w17, w19 + add w15, w15, #19 + + adds x15, x10, x15 + adcs x15, x11, xzr + adcs x15, x12, xzr + adcs x15, x13, xzr + adc x16, x14, xzr + + extr x16, x16, x15, #63 + mul w16, w16, w19 + + adds x10, x10, x16 + adcs x11, x11, xzr + adcs x12, x12, xzr + adc x13, x13, xzr + and x13, x13, 0x7fffffffffffffff + + ldr x17, [sp, STACK_OUT_PTR] + stp x10, x11, [x17] + stp x12, x13, [x17, #16] + + add sp, sp, STACK_OUT_PTR+8 + + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x25, x26, [sp, 
#64] + ldp x27, x28, [sp, #80] + ldp d8, d9, [sp, #96] + ldp d10, d11, [sp, #112] + ldp d12, d13, [sp, #128] + ldp d14, d15, [sp, #144] + ldp x29, x30, [sp], #160 + + ret + // .size x25519_scalarmult, .-x25519_scalarmult + // .type invtable, %object +invtable: + // square times, + // skip mul, + // mulsource, + // dest + .hword 1|(1<<8) |(1<<11) + .hword 2| (2<<9)|(2<<11) + .hword 0| (1<<9)|(1<<11) + .hword 1| (2<<9)|(2<<11) + .hword 5| (2<<9)|(2<<11) + .hword 10| (2<<9)|(3<<11) + .hword 20| (3<<9) + .hword 10| (2<<9)|(2<<11) + .hword 50| (2<<9)|(3<<11) + .hword 100| (3<<9) + .hword 50| (2<<9) + .hword 5| (1<<9) + .hword 0| (0<<9) + // .size invtable, .-invtable + +END: \ No newline at end of file diff --git a/tutorial/opt/aarch64_simple0_loop_opt_a55.s b/tutorial/opt/aarch64_simple0_loop_opt_a55.s new file mode 100644 index 00000000..3949d03b --- /dev/null +++ b/tutorial/opt/aarch64_simple0_loop_opt_a55.s @@ -0,0 +1,125 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 + ldr q16, [x0, #16] + mul v14.8H, v16.8H, v0.H[0] + sub count, count, #1 +start: + ldr q5, [x0, #48] // ...*.............. + // gap // .................. + // gap // .................. + // gap // .................. + sqrdmulh v23.8H, v16.8H, v0.H[1] // .....*............ + // gap // .................. 
+ ldr q15, [x0, #0] // *................. + // gap // .................. + // gap // .................. + // gap // .................. + mul v19.8H, v5.8H, v0.H[0] // .........*........ + // gap // .................. + sqrdmulh v24.8H, v5.8H, v0.H[1] // ..........*....... + // gap // .................. + mls v14.8H, v23.8H, v1.H[0] // ......*........... + // gap // .................. + ldr q4, [x0, #32] // ..*............... + // gap // .................. + // gap // .................. + // gap // .................. + mls v19.8H, v24.8H, v1.H[0] // ...........*...... + // gap // .................. + sub v5.8H, v15.8H, v14.8H // .......*.......... + // gap // .................. + ldr q16, [x0, #80] // .e................ + // gap // .................. + // gap // .................. + // gap // .................. + sub v7.8H, v4.8H, v19.8H // ............*..... + // gap // .................. + str q5, [x0, #16] // ...............*.. + // gap // .................. + add v21.8H, v15.8H, v14.8H // ........*......... + // gap // .................. + str q7, [x0, #48] // .................* + // gap // .................. + add v8.8H, v4.8H, v19.8H // .............*.... + // gap // .................. + str q21, [x0], #4*16 // ..............*... + // gap // .................. + mul v14.8H, v16.8H, v0.H[0] // ....e............. + // gap // .................. + str q8, [x0, #-32] // ................*. + // gap // .................. + + // original source code + // ldr q8, [x0, #0*16] // .........|.*............... + // ldr q9, [x0, #1*16] // e........|........e........ + // ldr q10, [x0, #2*16] // .........|.....*........... + // ldr q11, [x0, #3*16] // .........*................. + // mul v12.8h, v9.8h, v0.h[0] // .......e.|...............e. + // sqrdmulh v9.8h, v9.8h, v0.h[1] // .........|*................ + // mls v12.8h, v9.8h, v1.h[0] // .........|....*............ + // sub v9.8h, v8.8h, v12.8h // .........|.......*......... 
+ // add v8.8h, v8.8h, v12.8h // ...*.....|...........*..... + // mul v12.8h, v11.8h, v0.h[0] // .........|..*.............. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // .........|...*............. + // mls v12.8h, v11.8h, v1.h[0] // .........|......*.......... + // sub v11.8h, v10.8h, v12.8h // .*.......|.........*....... + // add v10.8h, v10.8h, v12.8h // .....*...|.............*... + // str q8, [x0], #4*16 // ......*..|..............*.. + // str q9, [x0, #-3*16] // ..*......|..........*...... + // str q10, [x0, #-2*16] // ........*|................* + // str q11, [x0, #-1*16] // ....*....|............*.... + + sub count, count, #1 + cbnz count, start + ldr q5, [x0, #48] + sqrdmulh v23.8H, v16.8H, v0.H[1] + ldr q15, [x0, #0] + mul v19.8H, v5.8H, v0.H[0] + sqrdmulh v24.8H, v5.8H, v0.H[1] + mls v14.8H, v23.8H, v1.H[0] + ldr q4, [x0, #32] + mls v19.8H, v24.8H, v1.H[0] + sub v5.8H, v15.8H, v14.8H + sub v7.8H, v4.8H, v19.8H + str q5, [x0, #16] + add v21.8H, v15.8H, v14.8H + str q7, [x0, #48] + add v8.8H, v4.8H, v19.8H + str q21, [x0], #4*16 + str q8, [x0, #-32] \ No newline at end of file diff --git a/tutorial/opt/aarch64_simple0_loop_opt_mca_a55.s b/tutorial/opt/aarch64_simple0_loop_opt_mca_a55.s new file mode 100644 index 00000000..7fbbe9d6 --- /dev/null +++ b/tutorial/opt/aarch64_simple0_loop_opt_mca_a55.s @@ -0,0 +1,421 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 +ldr qtwiddle, 
[twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 + ldr q31, [x0, #16] + mul v4.8H, v31.8H, v0.H[0] + sub count, count, #1 +start: + ldr q25, [x0, #48] // ...*.............. + // gap // .................. + // gap // .................. + // gap // .................. + sqrdmulh v11.8H, v31.8H, v0.H[1] // .....*............ + // gap // .................. + ldr q12, [x0, #0] // *................. + // gap // .................. + // gap // .................. + // gap // .................. + mul v3.8H, v25.8H, v0.H[0] // .........*........ + // gap // .................. + sqrdmulh v31.8H, v25.8H, v0.H[1] // ..........*....... + // gap // .................. + mls v4.8H, v11.8H, v1.H[0] // ......*........... + // gap // .................. + ldr q25, [x0, #32] // ..*............... + // gap // .................. + // gap // .................. + // gap // .................. + mls v3.8H, v31.8H, v1.H[0] // ...........*...... + // gap // .................. + sub v23.8H, v12.8H, v4.8H // .......*.......... + // gap // .................. + ldr q31, [x0, #80] // .e................ + // gap // .................. + // gap // .................. + // gap // .................. + sub v19.8H, v25.8H, v3.8H // ............*..... + // gap // .................. + str q23, [x0, #16] // ...............*.. + // gap // .................. + add v3.8H, v25.8H, v3.8H // .............*.... + // gap // .................. + str q19, [x0, #48] // .................* + // gap // .................. + add v25.8H, v12.8H, v4.8H // ........*......... + // gap // .................. + str q3, [x0, #32] // ................*. + // gap // .................. + mul v4.8H, v31.8H, v0.H[0] // ....e............. + // gap // .................. + str q25, [x0], #4*16 // ..............*... + // gap // .................. + + // original source code + // ldr q8, [x0, #0*16] // .........|.*............... + // ldr q9, [x0, #1*16] // e........|........e........ 
+ // ldr q10, [x0, #2*16] // .........|.....*........... + // ldr q11, [x0, #3*16] // .........*................. + // mul v12.8h, v9.8h, v0.h[0] // .......e.|...............e. + // sqrdmulh v9.8h, v9.8h, v0.h[1] // .........|*................ + // mls v12.8h, v9.8h, v1.h[0] // .........|....*............ + // sub v9.8h, v8.8h, v12.8h // .........|.......*......... + // add v8.8h, v8.8h, v12.8h // .....*...|.............*... + // mul v12.8h, v11.8h, v0.h[0] // .........|..*.............. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // .........|...*............. + // mls v12.8h, v11.8h, v1.h[0] // .........|......*.......... + // sub v11.8h, v10.8h, v12.8h // .*.......|.........*....... + // add v10.8h, v10.8h, v12.8h // ...*.....|...........*..... + // str q8, [x0], #4*16 // ........*|................* + // str q9, [x0, #-3*16] // ..*......|..........*...... + // str q10, [x0, #-2*16] // ......*..|..............*.. + // str q11, [x0, #-1*16] // ....*....|............*.... + + // + // LLVM MCA STATISTICS (ORIGINAL) BEGIN + // + // + // [0] Code Region + // + // Iterations: 100 + // Instructions: 1800 + // Total Cycles: 2902 + // Total uOps: 1900 + // + // Dispatch Width: 2 + // uOps Per Cycle: 0.65 + // IPC: 0.62 + // Block RThroughput: 10.0 + // + // + // Resources: + // [0.0] - CortexA55UnitALU + // [0.1] - CortexA55UnitALU + // [1] - CortexA55UnitB + // [2] - CortexA55UnitDiv + // [3.0] - CortexA55UnitFPALU + // [3.1] - CortexA55UnitFPALU + // [4] - CortexA55UnitFPDIV + // [5.0] - CortexA55UnitFPMAC + // [5.1] - CortexA55UnitFPMAC + // [6] - CortexA55UnitLd + // [7] - CortexA55UnitMAC + // [8] - CortexA55UnitSt + // + // + // Resource pressure per iteration: + // [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] + // - - - - 10.00 10.00 - - - 4.00 - 4.00 + // + // Resource pressure by instruction: + // [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: + // - - - - - - - - - 1.00 - - ldr q8, [x0] + // - - - - - - - - - 1.00 - - ldr q9, 
[x0, #16] + // - - - - - - - - - 1.00 - - ldr q10, [x0, #32] + // - - - - - - - - - 1.00 - - ldr q11, [x0, #48] + // - - - - - 2.00 - - - - - - mul.8h v12, v9, v0[0] + // - - - - 2.00 - - - - - - - sqrdmulh.8h v9, v9, v0[1] + // - - - - - 2.00 - - - - - - mls.8h v12, v9, v1[0] + // - - - - 2.00 - - - - - - - sub.8h v9, v8, v12 + // - - - - - 2.00 - - - - - - add.8h v8, v8, v12 + // - - - - 2.00 - - - - - - - mul.8h v12, v11, v0[0] + // - - - - - 2.00 - - - - - - sqrdmulh.8h v11, v11, v0[1] + // - - - - 2.00 - - - - - - - mls.8h v12, v11, v1[0] + // - - - - - 2.00 - - - - - - sub.8h v11, v10, v12 + // - - - - 2.00 - - - - - - - add.8h v10, v10, v12 + // - - - - - - - - - - - 1.00 str q8, [x0], #64 + // - - - - - - - - - - - 1.00 stur q9, [x0, #-48] + // - - - - - - - - - - - 1.00 stur q10, [x0, #-32] + // - - - - - - - - - - - 1.00 stur q11, [x0, #-16] + // + // + // Timeline view: + // 0123456789 0123456789 0123456789 0123456789 + // Index 0123456789 0123456789 0123456789 0123456789 012345678 + // + // [0,0] DeeE . . . . . . . . . . . . . . . . . . ldr q8, [x0] + // [0,1] .DeeE. . . . . . . . . . . . . . . . . . ldr q9, [x0, #16] + // [0,2] . DeeE . . . . . . . . . . . . . . . . . ldr q10, [x0, #32] + // [0,3] . DeeE . . . . . . . . . . . . . . . . . ldr q11, [x0, #48] + // [0,4] . DeeeE . . . . . . . . . . . . . . . . . mul.8h v12, v9, v0[0] + // [0,5] . DeeeE. . . . . . . . . . . . . . . . . sqrdmulh.8h v9, v9, v0[1] + // [0,6] . . DeeeE . . . . . . . . . . . . . . . . mls.8h v12, v9, v1[0] + // [0,7] . . . DeE . . . . . . . . . . . . . . . sub.8h v9, v8, v12 + // [0,8] . . . DeE . . . . . . . . . . . . . . . add.8h v8, v8, v12 + // [0,9] . . . DeeeE. . . . . . . . . . . . . . . mul.8h v12, v11, v0[0] + // [0,10] . . . .DeeeE . . . . . . . . . . . . . . sqrdmulh.8h v11, v11, v0[1] + // [0,11] . . . . DeeeE. . . . . . . . . . . . . . mls.8h v12, v11, v1[0] + // [0,12] . . . . . DeE . . . . . . . . . . . . . sub.8h v11, v10, v12 + // [0,13] . . . . . DeE . . . . . 
. . . . . . . . add.8h v10, v10, v12 + // [0,14] . . . . . .DE . . . . . . . . . . . . . str q8, [x0], #64 + // [0,15] . . . . . . DE . . . . . . . . . . . . . stur q9, [x0, #-48] + // [0,16] . . . . . . DE. . . . . . . . . . . . . stur q10, [x0, #-32] + // [0,17] . . . . . . DE . . . . . . . . . . . . stur q11, [x0, #-16] + // [1,0] . . . . . . DeeE . . . . . . . . . . . . ldr q8, [x0] + // [1,1] . . . . . . DeeE . . . . . . . . . . . . ldr q9, [x0, #16] + // [1,2] . . . . . . .DeeE. . . . . . . . . . . . ldr q10, [x0, #32] + // [1,3] . . . . . . . DeeE . . . . . . . . . . . ldr q11, [x0, #48] + // [1,4] . . . . . . . DeeeE . . . . . . . . . . . mul.8h v12, v9, v0[0] + // [1,5] . . . . . . . DeeeE . . . . . . . . . . . sqrdmulh.8h v9, v9, v0[1] + // [1,6] . . . . . . . . DeeeE . . . . . . . . . . mls.8h v12, v9, v1[0] + // [1,7] . . . . . . . . . DeE. . . . . . . . . . sub.8h v9, v8, v12 + // [1,8] . . . . . . . . . DeE . . . . . . . . . add.8h v8, v8, v12 + // [1,9] . . . . . . . . . DeeeE . . . . . . . . . mul.8h v12, v11, v0[0] + // [1,10] . . . . . . . . . DeeeE. . . . . . . . . sqrdmulh.8h v11, v11, v0[1] + // [1,11] . . . . . . . . . . DeeeE . . . . . . . . mls.8h v12, v11, v1[0] + // [1,12] . . . . . . . . . . . DeE . . . . . . . sub.8h v11, v10, v12 + // [1,13] . . . . . . . . . . . DeE . . . . . . . add.8h v10, v10, v12 + // [1,14] . . . . . . . . . . . DE . . . . . . . str q8, [x0], #64 + // [1,15] . . . . . . . . . . . .DE . . . . . . . stur q9, [x0, #-48] + // [1,16] . . . . . . . . . . . . DE . . . . . . . stur q10, [x0, #-32] + // [1,17] . . . . . . . . . . . . DE. . . . . . . stur q11, [x0, #-16] + // [2,0] . . . . . . . . . . . . DeeE . . . . . . ldr q8, [x0] + // [2,1] . . . . . . . . . . . . DeeE . . . . . . ldr q9, [x0, #16] + // [2,2] . . . . . . . . . . . . DeeE . . . . . . ldr q10, [x0, #32] + // [2,3] . . . . . . . . . . . . .DeeE. . . . . . ldr q11, [x0, #48] + // [2,4] . . . . . . . . . . . . . DeeeE . . . . . 
mul.8h v12, v9, v0[0] + // [2,5] . . . . . . . . . . . . . DeeeE . . . . . sqrdmulh.8h v9, v9, v0[1] + // [2,6] . . . . . . . . . . . . . . DeeeE . . . . mls.8h v12, v9, v1[0] + // [2,7] . . . . . . . . . . . . . . .DeE . . . . sub.8h v9, v8, v12 + // [2,8] . . . . . . . . . . . . . . . DeE. . . . add.8h v8, v8, v12 + // [2,9] . . . . . . . . . . . . . . . DeeeE . . . mul.8h v12, v11, v0[0] + // [2,10] . . . . . . . . . . . . . . . DeeeE . . . sqrdmulh.8h v11, v11, v0[1] + // [2,11] . . . . . . . . . . . . . . . . DeeeE . . mls.8h v12, v11, v1[0] + // [2,12] . . . . . . . . . . . . . . . . . DeE. . sub.8h v11, v10, v12 + // [2,13] . . . . . . . . . . . . . . . . . DeE . add.8h v10, v10, v12 + // [2,14] . . . . . . . . . . . . . . . . . DE . str q8, [x0], #64 + // [2,15] . . . . . . . . . . . . . . . . . DE . stur q9, [x0, #-48] + // [2,16] . . . . . . . . . . . . . . . . . .DE. stur q10, [x0, #-32] + // [2,17] . . . . . . . . . . . . . . . . . . DE stur q11, [x0, #-16] + // + // + // Average Wait times (based on the timeline view): + // [0]: Executions + // [1]: Average time spent waiting in a scheduler's queue + // [2]: Average time spent waiting in a scheduler's queue while ready + // [3]: Average time elapsed from WB until retire stage + // + // [0] [1] [2] [3] + // 0. 3 0.0 0.0 0.0 ldr q8, [x0] + // 1. 3 0.0 0.0 0.0 ldr q9, [x0, #16] + // 2. 3 0.0 0.0 0.0 ldr q10, [x0, #32] + // 3. 3 0.0 0.0 0.0 ldr q11, [x0, #48] + // 4. 3 0.0 0.0 0.0 mul.8h v12, v9, v0[0] + // 5. 3 0.0 0.0 0.0 sqrdmulh.8h v9, v9, v0[1] + // 6. 3 0.0 0.0 0.0 mls.8h v12, v9, v1[0] + // 7. 3 0.0 0.0 0.0 sub.8h v9, v8, v12 + // 8. 3 0.0 0.0 0.0 add.8h v8, v8, v12 + // 9. 3 0.0 0.0 0.0 mul.8h v12, v11, v0[0] + // 10. 3 0.0 0.0 0.0 sqrdmulh.8h v11, v11, v0[1] + // 11. 3 0.0 0.0 0.0 mls.8h v12, v11, v1[0] + // 12. 3 0.0 0.0 0.0 sub.8h v11, v10, v12 + // 13. 3 0.0 0.0 0.0 add.8h v10, v10, v12 + // 14. 3 0.0 0.0 0.0 str q8, [x0], #64 + // 15. 3 0.0 0.0 0.0 stur q9, [x0, #-48] + // 16. 
3 0.0 0.0 0.0 stur q10, [x0, #-32] + // 17. 3 0.0 0.0 0.0 stur q11, [x0, #-16] + // 3 0.0 0.0 0.0 + // + // + // LLVM MCA STATISTICS (ORIGINAL) END + // + // + // LLVM MCA STATISTICS (OPTIMIZED) BEGIN + // + // + // [0] Code Region + // + // Iterations: 100 + // Instructions: 1800 + // Total Cycles: 1803 + // Total uOps: 1900 + // + // Dispatch Width: 2 + // uOps Per Cycle: 1.05 + // IPC: 1.00 + // Block RThroughput: 10.0 + // + // + // Resources: + // [0.0] - CortexA55UnitALU + // [0.1] - CortexA55UnitALU + // [1] - CortexA55UnitB + // [2] - CortexA55UnitDiv + // [3.0] - CortexA55UnitFPALU + // [3.1] - CortexA55UnitFPALU + // [4] - CortexA55UnitFPDIV + // [5.0] - CortexA55UnitFPMAC + // [5.1] - CortexA55UnitFPMAC + // [6] - CortexA55UnitLd + // [7] - CortexA55UnitMAC + // [8] - CortexA55UnitSt + // + // + // Resource pressure per iteration: + // [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] + // - - - - 10.00 10.00 - - - 4.00 - 4.00 + // + // Resource pressure by instruction: + // [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: + // - - - - - - - - - 1.00 - - ldr q25, [x0, #48] + // - - - - - 2.00 - - - - - - sqrdmulh.8h v11, v31, v0[1] + // - - - - - - - - - 1.00 - - ldr q12, [x0] + // - - - - 2.00 - - - - - - - mul.8h v3, v25, v0[0] + // - - - - - 2.00 - - - - - - sqrdmulh.8h v31, v25, v0[1] + // - - - - 2.00 - - - - - - - mls.8h v4, v11, v1[0] + // - - - - - - - - - 1.00 - - ldr q25, [x0, #32] + // - - - - - 2.00 - - - - - - mls.8h v3, v31, v1[0] + // - - - - 2.00 - - - - - - - sub.8h v23, v12, v4 + // - - - - - - - - - 1.00 - - ldr q31, [x0, #80] + // - - - - - 2.00 - - - - - - sub.8h v19, v25, v3 + // - - - - - - - - - - - 1.00 str q23, [x0, #16] + // - - - - 2.00 - - - - - - - add.8h v3, v25, v3 + // - - - - - - - - - - - 1.00 str q19, [x0, #48] + // - - - - - 2.00 - - - - - - add.8h v25, v12, v4 + // - - - - - - - - - - - 1.00 str q3, [x0, #32] + // - - - - 2.00 - - - - - - - mul.8h v4, v31, v0[0] + // - - - 
- - - - - - - - 1.00 str q25, [x0], #64 + // + // + // Timeline view: + // 0123456789 0123456789 0123456 + // Index 0123456789 0123456789 0123456789 + // + // [0,0] DeeE . . . . . . . . . . .. ldr q25, [x0, #48] + // [0,1] .DeeeE . . . . . . . . . .. sqrdmulh.8h v11, v31, v0[1] + // [0,2] . DeeE . . . . . . . . . .. ldr q12, [x0] + // [0,3] . DeeeE . . . . . . . . . .. mul.8h v3, v25, v0[0] + // [0,4] . DeeeE . . . . . . . . . .. sqrdmulh.8h v31, v25, v0[1] + // [0,5] . DeeeE. . . . . . . . . .. mls.8h v4, v11, v1[0] + // [0,6] . .DeeE. . . . . . . . . .. ldr q25, [x0, #32] + // [0,7] . . DeeeE . . . . . . . . .. mls.8h v3, v31, v1[0] + // [0,8] . . DeE . . . . . . . . .. sub.8h v23, v12, v4 + // [0,9] . . DeeE . . . . . . . . .. ldr q31, [x0, #80] + // [0,10] . . . DeE. . . . . . . . .. sub.8h v19, v25, v3 + // [0,11] . . . DE . . . . . . . . .. str q23, [x0, #16] + // [0,12] . . . DeE . . . . . . . .. add.8h v3, v25, v3 + // [0,13] . . . DE . . . . . . . .. str q19, [x0, #48] + // [0,14] . . . DeE . . . . . . . .. add.8h v25, v12, v4 + // [0,15] . . . DE . . . . . . . .. str q3, [x0, #32] + // [0,16] . . . .DeeeE . . . . . . .. mul.8h v4, v31, v0[0] + // [0,17] . . . . DE . . . . . . . .. str q25, [x0], #64 + // [1,0] . . . . DeeE . . . . . . .. ldr q25, [x0, #48] + // [1,1] . . . . DeeeE . . . . . . .. sqrdmulh.8h v11, v31, v0[1] + // [1,2] . . . . DeeE . . . . . . .. ldr q12, [x0] + // [1,3] . . . . .DeeeE . . . . . .. mul.8h v3, v25, v0[0] + // [1,4] . . . . . DeeeE . . . . . .. sqrdmulh.8h v31, v25, v0[1] + // [1,5] . . . . . DeeeE . . . . . .. mls.8h v4, v11, v1[0] + // [1,6] . . . . . DeeE . . . . . .. ldr q25, [x0, #32] + // [1,7] . . . . . .DeeeE . . . . .. mls.8h v3, v31, v1[0] + // [1,8] . . . . . . DeE . . . . .. sub.8h v23, v12, v4 + // [1,9] . . . . . . DeeE . . . . .. ldr q31, [x0, #80] + // [1,10] . . . . . . DeE . . . . .. sub.8h v19, v25, v3 + // [1,11] . . . . . . DE . . . . .. str q23, [x0, #16] + // [1,12] . . . . . . .DeE . . . . .. 
add.8h v3, v25, v3 + // [1,13] . . . . . . . DE . . . . .. str q19, [x0, #48] + // [1,14] . . . . . . . DeE . . . .. add.8h v25, v12, v4 + // [1,15] . . . . . . . DE. . . . .. str q3, [x0, #32] + // [1,16] . . . . . . . DeeeE . . . .. mul.8h v4, v31, v0[0] + // [1,17] . . . . . . . DE . . . .. str q25, [x0], #64 + // [2,0] . . . . . . . .DeeE. . . .. ldr q25, [x0, #48] + // [2,1] . . . . . . . . DeeeE . . .. sqrdmulh.8h v11, v31, v0[1] + // [2,2] . . . . . . . . DeeE . . .. ldr q12, [x0] + // [2,3] . . . . . . . . DeeeE . . .. mul.8h v3, v25, v0[0] + // [2,4] . . . . . . . . DeeeE. . .. sqrdmulh.8h v31, v25, v0[1] + // [2,5] . . . . . . . . .DeeeE . .. mls.8h v4, v11, v1[0] + // [2,6] . . . . . . . . . DeeE . .. ldr q25, [x0, #32] + // [2,7] . . . . . . . . . DeeeE . .. mls.8h v3, v31, v1[0] + // [2,8] . . . . . . . . . .DeE . .. sub.8h v23, v12, v4 + // [2,9] . . . . . . . . . .DeeE. .. ldr q31, [x0, #80] + // [2,10] . . . . . . . . . . DeE .. sub.8h v19, v25, v3 + // [2,11] . . . . . . . . . . DE. .. str q23, [x0, #16] + // [2,12] . . . . . . . . . . DeE .. add.8h v3, v25, v3 + // [2,13] . . . . . . . . . . DE .. str q19, [x0, #48] + // [2,14] . . . . . . . . . . .DeE .. add.8h v25, v12, v4 + // [2,15] . . . . . . . . . . .DE .. str q3, [x0, #32] + // [2,16] . . . . . . . . . . . DeeeE mul.8h v4, v31, v0[0] + // [2,17] . . . . . . . . . . . DE.. str q25, [x0], #64 + // + // + // Average Wait times (based on the timeline view): + // [0]: Executions + // [1]: Average time spent waiting in a scheduler's queue + // [2]: Average time spent waiting in a scheduler's queue while ready + // [3]: Average time elapsed from WB until retire stage + // + // [0] [1] [2] [3] + // 0. 3 0.0 0.0 0.0 ldr q25, [x0, #48] + // 1. 3 0.0 0.0 0.0 sqrdmulh.8h v11, v31, v0[1] + // 2. 3 0.0 0.0 0.0 ldr q12, [x0] + // 3. 3 0.0 0.0 0.0 mul.8h v3, v25, v0[0] + // 4. 3 0.0 0.0 0.0 sqrdmulh.8h v31, v25, v0[1] + // 5. 3 0.0 0.0 0.0 mls.8h v4, v11, v1[0] + // 6. 
3 0.0 0.0 0.0 ldr q25, [x0, #32] + // 7. 3 0.0 0.0 0.0 mls.8h v3, v31, v1[0] + // 8. 3 0.0 0.0 0.0 sub.8h v23, v12, v4 + // 9. 3 0.0 0.0 0.0 ldr q31, [x0, #80] + // 10. 3 0.0 0.0 0.0 sub.8h v19, v25, v3 + // 11. 3 0.0 0.0 0.0 str q23, [x0, #16] + // 12. 3 0.0 0.0 0.0 add.8h v3, v25, v3 + // 13. 3 0.0 0.0 0.0 str q19, [x0, #48] + // 14. 3 0.0 0.0 0.0 add.8h v25, v12, v4 + // 15. 3 0.0 0.0 0.0 str q3, [x0, #32] + // 16. 3 0.0 0.0 0.0 mul.8h v4, v31, v0[0] + // 17. 3 0.0 0.0 0.0 str q25, [x0], #64 + // 3 0.0 0.0 0.0 + // + // + // LLVM MCA STATISTICS (OPTIMIZED) END + // + sub count, count, #1 + cbnz count, start + ldr q25, [x0, #48] + sqrdmulh v11.8H, v31.8H, v0.H[1] + ldr q12, [x0, #0] + mul v3.8H, v25.8H, v0.H[0] + sqrdmulh v31.8H, v25.8H, v0.H[1] + mls v4.8H, v11.8H, v1.H[0] + ldr q25, [x0, #32] + mls v3.8H, v31.8H, v1.H[0] + sub v23.8H, v12.8H, v4.8H + sub v19.8H, v25.8H, v3.8H + str q23, [x0, #16] + add v3.8H, v25.8H, v3.8H + str q19, [x0, #48] + add v25.8H, v12.8H, v4.8H + str q3, [x0, #32] + str q25, [x0], #4*16 \ No newline at end of file diff --git a/tutorial/opt/aarch64_simple0_macros_opt_a55.s b/tutorial/opt/aarch64_simple0_macros_opt_a55.s new file mode 100644 index 00000000..136fb51c --- /dev/null +++ b/tutorial/opt/aarch64_simple0_macros_opt_a55.s @@ -0,0 +1,117 @@ +qdata0 .req q8 +qdata1 .req q9 +qdata2 .req q10 +qdata3 .req q11 + +qtwiddle .req q0 +qmodulus .req q1 + +data0 .req v8 +data1 .req v9 +data2 .req v10 +data3 .req v11 + +twiddle .req v0 +modulus .req v1 + +tmp .req v12 + +data_ptr .req x0 +twiddle_ptr .req x1 +modulus_ptr .req x2 + +.macro barmul out, in, twiddle, modulus + mul \out.8h, \in.8h, \twiddle.h[0] + sqrdmulh \in.8h, \in.8h, \twiddle.h[1] + mls \out.8h, \in.8h, \modulus.h[0] +.endm + +.macro butterfly data0, data1, tmp, twiddle, modulus + barmul \tmp, \data1, \twiddle, \modulus + sub \data1.8h, \data0.8h, \tmp.8h + add \data0.8h, \data0.8h, \tmp.8h +.endm + +count .req x2 + + start: + ldr q4, [x1, #0] // 
*................... + // gap // .................... + // gap // .................... + // gap // .................... + ldr q25, [x0, #16] // ...*................ + // gap // .................... + // gap // .................... + // gap // .................... + ldr q1, [x0, #48] // .....*.............. + // gap // .................... + // gap // .................... + // gap // .................... + mul v24.8H, v25.8H, v4.H[0] // ......*............. + // gap // .................... + sqrdmulh v25.8H, v25.8H, v4.H[1] // .......*............ + // gap // .................... + mul v8.8H, v1.8H, v4.H[0] // ...........*........ + // gap // .................... + sqrdmulh v4.8H, v1.8H, v4.H[1] // ............*....... + // gap // .................... + ldr q1, [x2, #0] // .*.................. + // gap // .................... + // gap // .................... + // gap // .................... + ldr q20, [x0, #0] // ..*................. + // gap // .................... + // gap // .................... + // gap // .................... + mls v24.8H, v25.8H, v1.H[0] // ........*........... + // gap // .................... + mls v8.8H, v4.8H, v1.H[0] // .............*...... + // gap // .................... + ldr q4, [x0, #32] // ....*............... + // gap // .................... + // gap // .................... + // gap // .................... + sub v25.8H, v20.8H, v24.8H // .........*.......... + // gap // .................... + add v1.8H, v20.8H, v24.8H // ..........*......... + // gap // .................... + sub v24.8H, v4.8H, v8.8H // ..............*..... + // gap // .................... + str q25, [x0, #16] // .................*.. + // gap // .................... + add v4.8H, v4.8H, v8.8H // ...............*.... + // gap // .................... + str q1, [x0], #4*16 // ................*... + // gap // .................... + // gap // .................... + // gap // .................... + str q4, [x0, #-32] // ..................*. + // gap // .................... 
+ // gap // .................... + // gap // .................... + str q24, [x0, #-16] // ...................* + // gap // .................... + + // original source code + // ldr q0, [x1, #0] // *................... + // ldr q1, [x2, #0] // .......*............ + // ldr q8, [x0, #0*16] // ........*........... + // ldr q9, [x0, #1*16] // .*.................. + // ldr q10, [x0, #2*16] // ...........*........ + // ldr q11, [x0, #3*16] // ..*................. + // mul v12.8h, v9.8h, v0.h[0] // ...*................ + // sqrdmulh v9.8h, v9.8h, v0.h[1] // ....*............... + // mls v12.8h, v9.8h, v1.h[0] // .........*.......... + // sub v9.8h, v8.8h, v12.8h // ............*....... + // add v8.8h, v8.8h, v12.8h // .............*...... + // mul v12.8h, v11.8h, v0.h[0] // .....*.............. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // ......*............. + // mls v12.8h, v11.8h, v1.h[0] // ..........*......... + // sub v11.8h, v10.8h, v12.8h // ..............*..... + // add v10.8h, v10.8h, v12.8h // ................*... + // str q8, [x0], #4*16 // .................*.. + // str q9, [x0, #-3*16] // ...............*.... + // str q10, [x0, #-2*16] // ..................*. + // str q11, [x0, #-1*16] // ...................* + + end: diff --git a/tutorial/opt/aarch64_simple0_opt_a55.s b/tutorial/opt/aarch64_simple0_opt_a55.s new file mode 100644 index 00000000..3c1beca7 --- /dev/null +++ b/tutorial/opt/aarch64_simple0_opt_a55.s @@ -0,0 +1,78 @@ + ldr q7, [x1, #0] // *................... + // gap // .................... + // gap // .................... + // gap // .................... + ldr q31, [x0, #16] // ...*................ + // gap // .................... + // gap // .................... + // gap // .................... + ldr q24, [x0, #48] // .....*.............. + // gap // .................... + // gap // .................... + // gap // .................... + mul v29.8H, v31.8H, v7.H[0] // ......*............. + // gap // .................... 
+ sqrdmulh v31.8H, v31.8H, v7.H[1] // .......*............ + // gap // .................... + mul v16.8H, v24.8H, v7.H[0] // ...........*........ + // gap // .................... + sqrdmulh v7.8H, v24.8H, v7.H[1] // ............*....... + // gap // .................... + ldr q1, [x2, #0] // .*.................. + // gap // .................... + // gap // .................... + // gap // .................... + ldr q24, [x0] // ..*................. + // gap // .................... + // gap // .................... + // gap // .................... + mls v29.8H, v31.8H, v1.H[0] // ........*........... + // gap // .................... + mls v16.8H, v7.8H, v1.H[0] // .............*...... + // gap // .................... + ldr q7, [x0, #32] // ....*............... + // gap // .................... + // gap // .................... + // gap // .................... + sub v31.8H, v24.8H, v29.8H // .........*.......... + // gap // .................... + add v24.8H, v24.8H, v29.8H // ..........*......... + // gap // .................... + sub v29.8H, v7.8H, v16.8H // ..............*..... + // gap // .................... + str q31, [x0, #16] // .................*.. + // gap // .................... + add v7.8H, v7.8H, v16.8H // ...............*.... + // gap // .................... + str q24, [x0], #4*16 // ................*... + // gap // .................... + // gap // .................... + // gap // .................... + str q7, [x0, #-32] // ..................*. + // gap // .................... + // gap // .................... + // gap // .................... + str q29, [x0, #-16] // ...................* + // gap // .................... + + // original source code + // ldr q0, [x1, #0] // *................... + // ldr q1, [x2, #0] // .......*............ + // ldr q8, [x0] // ........*........... + // ldr q9, [x0, #1*16] // .*.................. + // ldr q10, [x0, #2*16] // ...........*........ + // ldr q11, [x0, #3*16] // ..*................. 
+ // mul v24.8h, v9.8h, v0.h[0] // ...*................ + // sqrdmulh v9.8h, v9.8h, v0.h[1] // ....*............... + // mls v24.8h, v9.8h, v1.h[0] // .........*.......... + // sub v9.8h, v8.8h, v24.8h // ............*....... + // add v8.8h, v8.8h, v24.8h // .............*...... + // mul v24.8h, v11.8h, v0.h[0] // .....*.............. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // ......*............. + // mls v24.8h, v11.8h, v1.h[0] // ..........*......... + // sub v11.8h, v10.8h, v24.8h // ..............*..... + // add v10.8h, v10.8h, v24.8h // ................*... + // str q8, [x0], #4*16 // .................*.. + // str q9, [x0, #-3*16] // ...............*.... + // str q10, [x0, #-2*16] // ..................*. + // str q11, [x0, #-1*16] // ...................* diff --git a/tutorial/opt/ntt_kyber_123_4567_opt_a55.s b/tutorial/opt/ntt_kyber_123_4567_opt_a55.s new file mode 100644 index 00000000..b8f43f7e --- /dev/null +++ b/tutorial/opt/ntt_kyber_123_4567_opt_a55.s @@ -0,0 +1,1350 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, Slothy should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro ct_butterfly a, b, 
root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67 + ldr_vi root0, r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) // reserve 96 bytes of stack for x19-x29 + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): repeats the store on the previous line (same registers, same slot); looks redundant + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] // x29 is stored alone; x30 is not saved by this macro +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, 
#16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "ntt_kyber_123_45_67_twiddles.s" +.text + + .global ntt_kyber_123_4567 + .global _ntt_kyber_123_4567 + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ntt_kyber_123_4567: +_ntt_kyber_123_4567: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 
.req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + mov count, #4 + + load_roots_123 + + .p2align 2 + ldr q4, [x0, #0] // *......... + // gap // .......... + // gap // .......... + // gap // .......... + ldr q16, [x0, #128] // ..*....... + // gap // .......... + // gap // .......... + // gap // .......... + ldr q5, [x0, #192] // ...*...... + // gap // .......... + // gap // .......... + // gap // .......... + ldr q26, [x0, #256] // ....*..... + // gap // .......... + // gap // .......... + // gap // .......... + ldr q14, [x0, #320] // .....*.... + // gap // .......... + // gap // .......... + // gap // .......... + sqrdmulh v9.8H, v26.8H, v0.H[1] // ......*... + // gap // .......... + ldr q15, [x0, #448] // .......*.. + // gap // .......... + // gap // .......... + // gap // .......... + ldr q29, [x0, #384] // ........*. + // gap // .......... + // gap // .......... + // gap // .......... + sqrdmulh v13.8H, v15.8H, v0.H[1] // .........* + // gap // .......... + ldr q17, [x0, #64] // .*........ 
+ // gap // .......... + + // original source code + // ldr q4, [x0, #0] // *......... + // ldr q17, [x0, #64] // .........* + // ldr q16, [x0, #128] // .*........ + // ldr q5, [x0, #192] // ..*....... + // ldr q26, [x0, #256] // ...*...... + // ldr q14, [x0, #320] // ....*..... + // sqrdmulh v9.8H, v26.8H, v0.H[1] // .....*.... + // ldr q15, [x0, #448] // ......*... + // ldr q29, [x0, #384] // .......*.. + // sqrdmulh v13.8H, v15.8H, v0.H[1] // ........*. + + sub count, count, #1 +layer123_start: + mul v26.8H, v26.8H, v0.H[0] // ........*................................................................... + // gap // ............................................................................ + mul v21.8H, v14.8H, v0.H[0] // .............*.............................................................. + // gap // ............................................................................ + sqrdmulh v22.8H, v14.8H, v0.H[1] // ..............*............................................................. + // gap // ............................................................................ + mul v12.8H, v29.8H, v0.H[0] // ..................*......................................................... + // gap // ............................................................................ + mls v26.8H, v9.8H, v7.H[0] // ..........*................................................................. + // gap // ............................................................................ + sqrdmulh v14.8H, v29.8H, v0.H[1] // ...................*........................................................ + // gap // ............................................................................ + mls v21.8H, v22.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ 
+ mul v22.8H, v15.8H, v0.H[0] // .......................*.................................................... + // gap // ............................................................................ + sub v9.8H, v4.8H, v26.8H // ...........*................................................................ + // gap // ............................................................................ + mls v12.8H, v14.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ + add v26.8H, v4.8H, v26.8H // ............*............................................................... + // gap // ............................................................................ + sub v14.8H, v17.8H, v21.8H // ................*........................................................... + // gap // ............................................................................ + add v21.8H, v17.8H, v21.8H // .................*.......................................................... + // gap // ............................................................................ + sub v15.8H, v16.8H, v12.8H // .....................*...................................................... + // gap // ............................................................................ + add v12.8H, v16.8H, v12.8H // ......................*..................................................... + // gap // ............................................................................ + mls v22.8H, v13.8H, v7.H[0] // .........................*.................................................. + // gap // ............................................................................ + mul v16.8H, v15.8H, v0.H[4] // ......................................*..................................... + // gap // ............................................................................ 
+ sqrdmulh v15.8H, v15.8H, v0.H[5] // .......................................*.................................... + // gap // ............................................................................ + mul v4.8H, v12.8H, v0.H[2] // ............................*............................................... + // gap // ............................................................................ + sub v17.8H, v5.8H, v22.8H // ..........................*................................................. + // gap // ............................................................................ + add v22.8H, v5.8H, v22.8H // ...........................*................................................ + // gap // ............................................................................ + mls v16.8H, v15.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + sqrdmulh v12.8H, v12.8H, v0.H[3] // .............................*.............................................. + // gap // ............................................................................ + mul v15.8H, v17.8H, v0.H[4] // ...........................................*................................ + // gap // ............................................................................ + sqrdmulh v17.8H, v17.8H, v0.H[5] // ............................................*............................... + // gap // ............................................................................ + sub v5.8H, v9.8H, v16.8H // .........................................*.................................. + // gap // ............................................................................ + add v9.8H, v9.8H, v16.8H // ..........................................*................................. + // gap // ............................................................................ 
+ mls v4.8H, v12.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + mul v12.8H, v22.8H, v0.H[2] // .................................*.......................................... + // gap // ............................................................................ + mls v15.8H, v17.8H, v7.H[0] // .............................................*.............................. + // gap // ............................................................................ + sqrdmulh v22.8H, v22.8H, v0.H[3] // ..................................*......................................... + // gap // ............................................................................ + sub v16.8H, v26.8H, v4.8H // ...............................*............................................ + // gap // ............................................................................ + add v26.8H, v26.8H, v4.8H // ................................*........................................... + // gap // ............................................................................ + sub v4.8H, v14.8H, v15.8H // ..............................................*............................. + // gap // ............................................................................ + add v14.8H, v14.8H, v15.8H // ...............................................*............................ + // gap // ............................................................................ + mls v12.8H, v22.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + mul v22.8H, v4.8H, v1.H[4] // ...............................................................*............ + // gap // ............................................................................ 
+ mul v15.8H, v14.8H, v1.H[2] // ..........................................................*................. + // gap // ............................................................................ + sqrdmulh v14.8H, v14.8H, v1.H[3] // ...........................................................*................ + // gap // ............................................................................ + sub v17.8H, v21.8H, v12.8H // ....................................*....................................... + // gap // ............................................................................ + add v21.8H, v21.8H, v12.8H // .....................................*...................................... + // gap // ............................................................................ + sqrdmulh v12.8H, v4.8H, v1.H[5] // ................................................................*........... + // gap // ............................................................................ + mul v4.8H, v17.8H, v1.H[0] // .....................................................*...................... + // gap // ............................................................................ + mul v29.8H, v21.8H, v0.H[6] // ................................................*........................... + // gap // ............................................................................ + sqrdmulh v21.8H, v21.8H, v0.H[7] // .................................................*.......................... + // gap // ............................................................................ + sqrdmulh v17.8H, v17.8H, v1.H[1] // ......................................................*..................... + // gap // ............................................................................ + mls v15.8H, v14.8H, v7.H[0] // ............................................................*............... 
+ // gap // ............................................................................ + mls v22.8H, v12.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + mls v29.8H, v21.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + mls v4.8H, v17.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + sub v21.8H, v9.8H, v15.8H // .............................................................*.............. + // gap // ............................................................................ + sub v12.8H, v5.8H, v22.8H // ..................................................................*......... + // gap // ............................................................................ + add v22.8H, v5.8H, v22.8H // ...................................................................*........ + // gap // ............................................................................ + add v14.8H, v9.8H, v15.8H // ..............................................................*............. + // gap // ............................................................................ + sub v9.8H, v26.8H, v29.8H // ...................................................*........................ + // gap // ............................................................................ + add v26.8H, v26.8H, v29.8H // ....................................................*....................... + // gap // ............................................................................ + sub v15.8H, v16.8H, v4.8H // ........................................................*................... 
+ // gap // ............................................................................ + add v16.8H, v16.8H, v4.8H // .........................................................*.................. + // gap // ............................................................................ + str q26, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + ldr q4, [x0, #0] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q9, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ + ldr q17, [x0, #64] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #112] // ......................................................................*..... + // gap // ............................................................................ + ldr q16, [x0, #128] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ str q15, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + ldr q5, [x0, #192] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q14, [x0, #240] // ........................................................................*... + // gap // ............................................................................ + ldr q26, [x0, #256] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q21, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ + ldr q14, [x0, #320] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q22, [x0, #368] // ..........................................................................*. + // gap // ............................................................................ + sqrdmulh v9.8H, v26.8H, v0.H[1] // .........e.................................................................. 
+ // gap // ............................................................................ + str q12, [x0, #432] // ...........................................................................* + // gap // ............................................................................ + ldr q15, [x0, #448] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q29, [x0, #384] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v13.8H, v15.8H, v0.H[1] // ........................e................................................... + // gap // ............................................................................ + + // original source code + // ldr q8, [x0, #0] // e................|..........................................................e............. + // ldr q9, [x0, #(1*(512/8))] // ..e..............|............................................................e........... + // ldr q10, [x0, #(2*(512/8))] // ....e............|..............................................................e......... + // ldr q11, [x0, #(3*(512/8))] // ......e..........|................................................................e....... + // ldr q12, [x0, #(4*(512/8))] // ........e........|..................................................................e..... + // ldr q13, [x0, #(5*(512/8))] // ..........e......|....................................................................e... 
+ // ldr q14, [x0, #(6*(512/8))] // ...............e.|........................................................................ + // ldr q15, [x0, #(7*(512/8))] // ..............e..|........................................................................ + // mul v24.8h, v12.8h, v0.h[0] // .................*........................................................................ + // sqrdmulh v12.8h, v12.8h, v0.h[1] // ............e....|......................................................................e. + // mls v24.8h, v12.8h, v7.h[0] // .................|...*.................................................................... + // sub v12.8h, v8.8h, v24.8h // .................|.......*................................................................ + // add v8.8h, v8.8h, v24.8h // .................|.........*.............................................................. + // mul v24.8h, v13.8h, v0.h[0] // .................|*....................................................................... + // sqrdmulh v13.8h, v13.8h, v0.h[1] // .................|.*...................................................................... + // mls v24.8h, v13.8h, v7.h[0] // .................|.....*.................................................................. + // sub v13.8h, v9.8h, v24.8h // .................|..........*............................................................. + // add v9.8h, v9.8h, v24.8h // .................|...........*............................................................ + // mul v24.8h, v14.8h, v0.h[0] // .................|..*..................................................................... + // sqrdmulh v14.8h, v14.8h, v0.h[1] // .................|....*................................................................... + // mls v24.8h, v14.8h, v7.h[0] // .................|........*............................................................... 
+ // sub v14.8h, v10.8h, v24.8h // .................|............*........................................................... + // add v10.8h, v10.8h, v24.8h // .................|.............*.......................................................... + // mul v24.8h, v15.8h, v0.h[0] // .................|......*................................................................. + // sqrdmulh v15.8h, v15.8h, v0.h[1] // ................e|........................................................................ + // mls v24.8h, v15.8h, v7.h[0] // .................|..............*......................................................... + // sub v15.8h, v11.8h, v24.8h // .................|..................*..................................................... + // add v11.8h, v11.8h, v24.8h // .................|...................*.................................................... + // mul v24.8h, v10.8h, v0.h[2] // .................|.................*...................................................... + // sqrdmulh v10.8h, v10.8h, v0.h[3] // .................|.....................*.................................................. + // mls v24.8h, v10.8h, v7.h[0] // .................|..........................*............................................. + // sub v10.8h, v8.8h, v24.8h // .................|..............................*......................................... + // add v8.8h, v8.8h, v24.8h // .................|...............................*........................................ + // mul v24.8h, v11.8h, v0.h[2] // .................|...........................*............................................ + // sqrdmulh v11.8h, v11.8h, v0.h[3] // .................|.............................*.......................................... + // mls v24.8h, v11.8h, v7.h[0] // .................|..................................*..................................... 
+ // sub v11.8h, v9.8h, v24.8h // .................|......................................*................................. + // add v9.8h, v9.8h, v24.8h // .................|.......................................*................................ + // mul v24.8h, v14.8h, v0.h[4] // .................|...............*........................................................ + // sqrdmulh v14.8h, v14.8h, v0.h[5] // .................|................*....................................................... + // mls v24.8h, v14.8h, v7.h[0] // .................|....................*................................................... + // sub v14.8h, v12.8h, v24.8h // .................|........................*............................................... + // add v12.8h, v12.8h, v24.8h // .................|.........................*.............................................. + // mul v24.8h, v15.8h, v0.h[4] // .................|......................*................................................. + // sqrdmulh v15.8h, v15.8h, v0.h[5] // .................|.......................*................................................ + // mls v24.8h, v15.8h, v7.h[0] // .................|............................*........................................... + // sub v15.8h, v13.8h, v24.8h // .................|................................*....................................... + // add v13.8h, v13.8h, v24.8h // .................|.................................*...................................... + // mul v24.8h, v9.8h, v0.h[6] // .................|..........................................*............................. + // sqrdmulh v9.8h, v9.8h, v0.h[7] // .................|...........................................*............................ + // mls v24.8h, v9.8h, v7.h[0] // .................|...............................................*........................ 
+ // sub v9.8h, v8.8h, v24.8h // .................|.....................................................*.................. + // add v8.8h, v8.8h, v24.8h // .................|......................................................*................. + // mul v24.8h, v11.8h, v1.h[0] // .................|.........................................*.............................. + // sqrdmulh v11.8h, v11.8h, v1.h[1] // .................|............................................*........................... + // mls v24.8h, v11.8h, v7.h[0] // .................|................................................*....................... + // sub v11.8h, v10.8h, v24.8h // .................|.......................................................*................ + // add v10.8h, v10.8h, v24.8h // .................|........................................................*............... + // mul v24.8h, v13.8h, v1.h[2] // .................|....................................*................................... + // sqrdmulh v13.8h, v13.8h, v1.h[3] // .................|.....................................*.................................. + // mls v24.8h, v13.8h, v7.h[0] // .................|.............................................*.......................... + // sub v13.8h, v12.8h, v24.8h // .................|.................................................*...................... + // add v12.8h, v12.8h, v24.8h // .................|....................................................*................... + // mul v24.8h, v15.8h, v1.h[4] // .................|...................................*.................................... + // sqrdmulh v15.8h, v15.8h, v1.h[5] // .................|........................................*............................... + // mls v24.8h, v15.8h, v7.h[0] // .................|..............................................*......................... 
+ // sub v15.8h, v14.8h, v24.8h // .................|..................................................*..................... + // add v14.8h, v14.8h, v24.8h // .................|...................................................*.................... + // str q8, [x0], #(16) // .................|.........................................................*.............. + // str q9, [x0, #(-16 + 1*(512/8))] // .*...............|...........................................................*............ + // str q10, [x0, #(-16 + 2*(512/8))] // ...*.............|.............................................................*.......... + // str q11, [x0, #(-16 + 3*(512/8))] // .....*...........|...............................................................*........ + // str q12, [x0, #(-16 + 4*(512/8))] // .......*.........|.................................................................*...... + // str q13, [x0, #(-16 + 5*(512/8))] // .........*.......|...................................................................*.... + // str q14, [x0, #(-16 + 6*(512/8))] // ...........*.....|.....................................................................*.. + // str q15, [x0, #(-16 + 7*(512/8))] // .............*...|.......................................................................* + + sub count, count, #1 + cbnz count, layer123_start + mul v12.8H, v15.8H, v0.H[0] // .......*.......................................................... + // gap // .................................................................. + sqrdmulh v10.8H, v14.8H, v0.H[1] // ..*............................................................... + // gap // .................................................................. + mul v14.8H, v14.8H, v0.H[0] // .*................................................................ + // gap // .................................................................. 
+ sqrdmulh v3.8H, v29.8H, v0.H[1] // .....*............................................................ + // gap // .................................................................. + mls v12.8H, v13.8H, v7.H[0] // ...............*.................................................. + // gap // .................................................................. + mul v8.8H, v29.8H, v0.H[0] // ...*.............................................................. + // gap // .................................................................. + mls v14.8H, v10.8H, v7.H[0] // ......*........................................................... + // gap // .................................................................. + mul v13.8H, v26.8H, v0.H[0] // *................................................................. + // gap // .................................................................. + add v21.8H, v5.8H, v12.8H // ....................*............................................. + // gap // .................................................................. + sub v22.8H, v5.8H, v12.8H // ...................*.............................................. + // gap // .................................................................. + mls v8.8H, v3.8H, v7.H[0] // .........*........................................................ + // gap // .................................................................. + mul v15.8H, v21.8H, v0.H[2] // ............................*..................................... + // gap // .................................................................. + sqrdmulh v26.8H, v21.8H, v0.H[3] // ..............................*................................... + // gap // .................................................................. + sqrdmulh v12.8H, v22.8H, v0.H[5] // ........................*......................................... + // gap // .................................................................. 
+ mul v29.8H, v22.8H, v0.H[4] // .......................*.......................................... + // gap // .................................................................. + sub v21.8H, v16.8H, v8.8H // .............*.................................................... + // gap // .................................................................. + mls v15.8H, v26.8H, v7.H[0] // ...................................*.............................. + // gap // .................................................................. + sub v11.8H, v17.8H, v14.8H // ...........*...................................................... + // gap // .................................................................. + mls v29.8H, v12.8H, v7.H[0] // .............................*.................................... + // gap // .................................................................. + mul v3.8H, v21.8H, v0.H[4] // ................*................................................. + // gap // .................................................................. + sqrdmulh v26.8H, v21.8H, v0.H[5] // .................*................................................ + // gap // .................................................................. + mls v13.8H, v9.8H, v7.H[0] // ....*............................................................. + // gap // .................................................................. + sub v21.8H, v11.8H, v29.8H // .................................*................................ + // gap // .................................................................. + add v12.8H, v17.8H, v14.8H // ............*..................................................... + // gap // .................................................................. + mls v3.8H, v26.8H, v7.H[0] // .....................*............................................ + // gap // .................................................................. 
+ sqrdmulh v26.8H, v21.8H, v1.H[5] // .........................................*........................ + // gap // .................................................................. + mul v21.8H, v21.8H, v1.H[4] // ....................................*............................. + // gap // .................................................................. + add v16.8H, v16.8H, v8.8H // ..............*................................................... + // gap // .................................................................. + sub v28.8H, v4.8H, v13.8H // ........*......................................................... + // gap // .................................................................. + sub v22.8H, v12.8H, v15.8H // .......................................*.......................... + // gap // .................................................................. + mls v21.8H, v26.8H, v7.H[0] // ...............................................*.................. + // gap // .................................................................. + sub v9.8H, v28.8H, v3.8H // .........................*........................................ + // gap // .................................................................. + add v5.8H, v28.8H, v3.8H // ..........................*....................................... + // gap // .................................................................. + add v10.8H, v4.8H, v13.8H // ..........*....................................................... + // gap // .................................................................. + add v26.8H, v9.8H, v21.8H // ....................................................*............. + // gap // .................................................................. + mul v14.8H, v22.8H, v1.H[0] // ..........................................*....................... + // gap // .................................................................. 
+ add v12.8H, v12.8H, v15.8H // ........................................*......................... + // gap // .................................................................. + sqrdmulh v22.8H, v22.8H, v1.H[1] // .............................................*.................... + // gap // .................................................................. + sub v25.8H, v9.8H, v21.8H // ...................................................*.............. + // gap // .................................................................. + add v21.8H, v11.8H, v29.8H // ..................................*............................... + // gap // .................................................................. + sqrdmulh v9.8H, v16.8H, v0.H[3] // ......................*........................................... + // gap // .................................................................. + mul v4.8H, v16.8H, v0.H[2] // ..................*............................................... + // gap // .................................................................. + mul v15.8H, v12.8H, v0.H[6] // ...........................................*...................... + // gap // .................................................................. + mls v14.8H, v22.8H, v7.H[0] // .................................................*................ + // gap // .................................................................. + str q25, [x0, #448] // .................................................................* + // gap // .................................................................. + mls v4.8H, v9.8H, v7.H[0] // ...........................*...................................... + // gap // .................................................................. + sqrdmulh v22.8H, v21.8H, v1.H[3] // ......................................*........................... + // gap // .................................................................. 
+ mul v16.8H, v21.8H, v1.H[2] // .....................................*............................ + // gap // .................................................................. + sqrdmulh v12.8H, v12.8H, v0.H[7] // ............................................*..................... + // gap // .................................................................. + sub v17.8H, v10.8H, v4.8H // ...............................*.................................. + // gap // .................................................................. + str q26, [x0, #384] // ................................................................*. + // gap // .................................................................. + mls v16.8H, v22.8H, v7.H[0] // ..............................................*................... + // gap // .................................................................. + sub v21.8H, v17.8H, v14.8H // ........................................................*......... + // gap // .................................................................. + add v26.8H, v17.8H, v14.8H // .........................................................*........ + // gap // .................................................................. + mls v15.8H, v12.8H, v7.H[0] // ................................................*................. + // gap // .................................................................. + add v27.8H, v10.8H, v4.8H // ................................*................................. + // gap // .................................................................. + str q21, [x0, #192] // .............................................................*.... + // gap // .................................................................. + sub v9.8H, v5.8H, v16.8H // ..................................................*............... + // gap // .................................................................. 
+ str q26, [x0, #128] // ............................................................*..... + // gap // .................................................................. + sub v12.8H, v27.8H, v15.8H // ......................................................*........... + // gap // .................................................................. + str q9, [x0, #320] // ...............................................................*.. + // gap // .................................................................. + add v14.8H, v5.8H, v16.8H // .....................................................*............ + // gap // .................................................................. + str q12, [x0, #64] // ...........................................................*...... + // gap // .................................................................. + add v22.8H, v27.8H, v15.8H // .......................................................*.......... + // gap // .................................................................. + str q14, [x0, #256] // ..............................................................*... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q22, [x0], #(16) // ..........................................................*....... + // gap // .................................................................. + + // original source code + // mul v26.8H, v26.8H, v0.H[0] // .......*.......................................................... + // mul v21.8H, v14.8H, v0.H[0] // ..*............................................................... + // sqrdmulh v22.8H, v14.8H, v0.H[1] // .*................................................................ + // mul v12.8H, v29.8H, v0.H[0] // .....*............................................................ 
+ // mls v26.8H, v9.8H, v7.H[0] // .....................*............................................ + // sqrdmulh v14.8H, v29.8H, v0.H[1] // ...*.............................................................. + // mls v21.8H, v22.8H, v7.H[0] // ......*........................................................... + // mul v22.8H, v15.8H, v0.H[0] // *................................................................. + // sub v9.8H, v4.8H, v26.8H // ............................*..................................... + // mls v12.8H, v14.8H, v7.H[0] // ..........*....................................................... + // add v26.8H, v4.8H, v26.8H // .................................*................................ + // sub v14.8H, v17.8H, v21.8H // .................*................................................ + // add v21.8H, v17.8H, v21.8H // .......................*.......................................... + // sub v15.8H, v16.8H, v12.8H // ...............*.................................................. + // add v12.8H, v16.8H, v12.8H // ...........................*...................................... + // mls v22.8H, v13.8H, v7.H[0] // ....*............................................................. + // mul v16.8H, v15.8H, v0.H[4] // ...................*.............................................. + // sqrdmulh v15.8H, v15.8H, v0.H[5] // ....................*............................................. + // mul v4.8H, v12.8H, v0.H[2] // .........................................*........................ + // sub v17.8H, v5.8H, v22.8H // .........*........................................................ + // add v22.8H, v5.8H, v22.8H // ........*......................................................... + // mls v16.8H, v15.8H, v7.H[0] // ........................*......................................... + // sqrdmulh v12.8H, v12.8H, v0.H[3] // ........................................*......................... 
+ // mul v15.8H, v17.8H, v0.H[4] // ..............*................................................... + // sqrdmulh v17.8H, v17.8H, v0.H[5] // .............*.................................................... + // sub v5.8H, v9.8H, v16.8H // ...............................*.................................. + // add v9.8H, v9.8H, v16.8H // ................................*................................. + // mls v4.8H, v12.8H, v7.H[0] // .............................................*.................... + // mul v12.8H, v22.8H, v0.H[2] // ...........*...................................................... + // mls v15.8H, v17.8H, v7.H[0] // ..................*............................................... + // sqrdmulh v22.8H, v22.8H, v0.H[3] // ............*..................................................... + // sub v16.8H, v26.8H, v4.8H // .................................................*................ + // add v26.8H, v26.8H, v4.8H // .......................................................*.......... + // sub v4.8H, v14.8H, v15.8H // ......................*........................................... + // add v14.8H, v14.8H, v15.8H // .......................................*.......................... + // mls v12.8H, v22.8H, v7.H[0] // ................*................................................. + // mul v22.8H, v4.8H, v1.H[4] // ..........................*....................................... + // mul v15.8H, v14.8H, v1.H[2] // ...............................................*.................. + // sqrdmulh v14.8H, v14.8H, v1.H[3] // ..............................................*................... + // sub v17.8H, v21.8H, v12.8H // .............................*.................................... + // add v21.8H, v21.8H, v12.8H // ....................................*............................. + // sqrdmulh v12.8H, v4.8H, v1.H[5] // .........................*........................................ 
+ // mul v4.8H, v17.8H, v1.H[0] // ...................................*.............................. + // mul v29.8H, v21.8H, v0.H[6] // ..........................................*....................... + // sqrdmulh v21.8H, v21.8H, v0.H[7] // ................................................*................. + // sqrdmulh v17.8H, v17.8H, v1.H[1] // .....................................*............................ + // mls v15.8H, v14.8H, v7.H[0] // ...................................................*.............. + // mls v22.8H, v12.8H, v7.H[0] // ..............................*................................... + // mls v29.8H, v21.8H, v7.H[0] // ......................................................*........... + // mls v4.8H, v17.8H, v7.H[0] // ...........................................*...................... + // sub v21.8H, v9.8H, v15.8H // .........................................................*........ + // sub v12.8H, v5.8H, v22.8H // ......................................*........................... + // add v22.8H, v5.8H, v22.8H // ..................................*............................... + // add v14.8H, v9.8H, v15.8H // .............................................................*.... + // sub v9.8H, v26.8H, v29.8H // ...........................................................*...... + // add v26.8H, v26.8H, v29.8H // ...............................................................*.. + // sub v15.8H, v16.8H, v4.8H // ....................................................*............. + // add v16.8H, v16.8H, v4.8H // .....................................................*............ + // str q26, [x0], #(16) // .................................................................* + // str q9, [x0, #48] // ..............................................................*... + // str q16, [x0, #112] // ..........................................................*....... 
+ // str q15, [x0, #176] // ........................................................*......... + // str q14, [x0, #240] // ................................................................*. + // str q21, [x0, #304] // ............................................................*..... + // str q22, [x0, #368] // ..................................................*............... + // str q12, [x0, #432] // ............................................*..................... + + + restore inp, STACK0 + mov count, #8 + + .p2align 2 + ldr q13, [x3], #16 // ..*....................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q6, [x1, #48] // *......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q3, [x1, #32] // .*........................................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v4.8H, v6.8H, v13.H[1] // ...*...................................................... + // gap // .......................................................... + mul v18.8H, v6.8H, v13.H[0] // .....*.................................................... + // gap // .......................................................... + sqrdmulh v0.8H, v3.8H, v13.H[1] // .........*................................................ + // gap // .......................................................... + ldr q22, [x1, #16] // .......*.................................................. 
+ // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v18.8H, v4.8H, v7.H[0] // ........*................................................. + // gap // .......................................................... + mul v24.8H, v3.8H, v13.H[0] // ......*................................................... + // gap // .......................................................... + ldr q9, [x1, #0] // ..........*............................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v21.8H, v22.8H, v18.8H // .............*............................................ + // gap // .......................................................... + add v14.8H, v22.8H, v18.8H // ...........*.............................................. + // gap // .......................................................... + mls v24.8H, v0.8H, v7.H[0] // ............*............................................. + // gap // .......................................................... + mul v0.8H, v21.8H, v13.H[4] // .................*........................................ + // gap // .......................................................... + sqrdmulh v21.8H, v21.8H, v13.H[5] // ................*......................................... + // gap // .......................................................... + sqrdmulh v26.8H, v14.8H, v13.H[3] // ..............*........................................... + // gap // .......................................................... + mul v18.8H, v14.8H, v13.H[2] // ...............*.......................................... + // gap // .......................................................... 
+ ldr q11, [x4], #(6*16) // ....*..................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q5, [x4, #-80] // ..................*....................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v18.8H, v26.8H, v7.H[0] // ...................*...................................... + // gap // .......................................................... + add v26.8H, v9.8H, v24.8H // ....................*..................................... + // gap // .......................................................... + mls v0.8H, v21.8H, v7.H[0] // .....................*.................................... + // gap // .......................................................... + sub v22.8H, v9.8H, v24.8H // ......................*................................... + // gap // .......................................................... + add v12.8H, v26.8H, v18.8H // ........................*................................. + // gap // .......................................................... + sub v14.8H, v26.8H, v18.8H // .......................*.................................. + // gap // .......................................................... + sub v21.8H, v22.8H, v0.8H // .........................*................................ + // gap // .......................................................... + add v26.8H, v22.8H, v0.8H // ..........................*............................... + // gap // .......................................................... + trn2 v17.4S, v12.4S, v14.4S // ............................*............................. 
+ // gap // .......................................................... + trn1 v29.4S, v12.4S, v14.4S // ...........................*.............................. + // gap // .......................................................... + trn2 v4.4S, v26.4S, v21.4S // .............................*............................ + // gap // .......................................................... + trn1 v16.4S, v26.4S, v21.4S // ................................*......................... + // gap // .......................................................... + ldr q0, [x4, #-16] // ..................................................*....... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + trn2 v21.2D, v29.2D, v16.2D // ...................................*...................... + // gap // .......................................................... + trn2 v22.2D, v17.2D, v4.2D // ...............................*.......................... + // gap // .......................................................... + sqrdmulh v26.8H, v21.8H, v5.8H // ..........................................*............... + // gap // .......................................................... + mul v15.8H, v22.8H, v11.8H // .................................*........................ + // gap // .......................................................... + sqrdmulh v22.8H, v22.8H, v5.8H // ..................................*....................... + // gap // .......................................................... + ldr q12, [x4, #-64] // ........................................*................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ trn1 v14.2D, v17.2D, v4.2D // ....................................*..................... + // gap // .......................................................... + mls v15.8H, v22.8H, v7.H[0] // ......................................*................... + // gap // .......................................................... + ldr q5, [x4, #-48] // ..............................*........................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v9.8H, v21.8H, v11.8H // .....................................*.................... + // gap // .......................................................... + add v18.8H, v14.8H, v15.8H // .........................................*................ + // gap // .......................................................... + ldr q21, [x4, #-32] // ...............................................*.......... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v12.8H, v18.8H, v12.8H // ............................................*............. + // gap // .......................................................... + sqrdmulh v22.8H, v18.8H, v5.8H // .............................................*............ + // gap // .......................................................... + mls v9.8H, v26.8H, v7.H[0] // ..............................................*........... + // gap // .......................................................... + trn1 v16.2D, v29.2D, v16.2D // .......................................*.................. + // gap // .......................................................... + sub v14.8H, v14.8H, v15.8H // ...........................................*.............. 
+ // gap // .......................................................... + mls v12.8H, v22.8H, v7.H[0] // ................................................*......... + // gap // .......................................................... + add v26.8H, v16.8H, v9.8H // .................................................*........ + // gap // .......................................................... + sqrdmulh v25.8H, v14.8H, v0.8H // .......................................................*.. + // gap // .......................................................... + mul v18.8H, v14.8H, v21.8H // .....................................................*.... + // gap // .......................................................... + add v27.8H, v26.8H, v12.8H // ....................................................*..... + // gap // .......................................................... + sub v28.8H, v26.8H, v12.8H // ...................................................*...... + // gap // .......................................................... + sub v8.8H, v16.8H, v9.8H // ........................................................*. + // gap // .......................................................... + sqdmulh v24.8H, v27.8H, v7.H[1] // .........................................................* + // gap // .......................................................... + sqdmulh v0.8H, v28.8H, v7.H[1] // ......................................................*... + // gap // .......................................................... + + // original source code + // ldr q0, [x1, #48] // .*........................................................ + // ldr q23, [x1, #32] // ..*....................................................... + // ldr q10, [x3], #16 // *......................................................... + // sqrdmulh v2.8H, v0.8H, v10.H[1] // ...*...................................................... 
+ // ldr q22, [x4], #(6*16) // .................*........................................ + // mul v0.8H, v0.8H, v10.H[0] // ....*..................................................... + // mul v26.8H, v23.8H, v10.H[0] // ........*................................................. + // ldr q6, [x1, #16] // ......*................................................... + // mls v0.8H, v2.8H, v7.H[0] // .......*.................................................. + // sqrdmulh v24.8H, v23.8H, v10.H[1] // .....*.................................................... + // ldr q18, [x1, #0] // .........*................................................ + // add v14.8H, v6.8H, v0.8H // ...........*.............................................. + // mls v26.8H, v24.8H, v7.H[0] // ............*............................................. + // sub v17.8H, v6.8H, v0.8H // ..........*............................................... + // sqrdmulh v21.8H, v14.8H, v10.H[3] // ...............*.......................................... + // mul v24.8H, v14.8H, v10.H[2] // ................*......................................... + // sqrdmulh v8.8H, v17.8H, v10.H[5] // ..............*........................................... + // mul v31.8H, v17.8H, v10.H[4] // .............*............................................ + // ldr q16, [x4, #-80] // ..................*....................................... + // mls v24.8H, v21.8H, v7.H[0] // ...................*...................................... + // add v6.8H, v18.8H, v26.8H // ....................*..................................... + // mls v31.8H, v8.8H, v7.H[0] // .....................*.................................... + // sub v0.8H, v18.8H, v26.8H // ......................*................................... + // sub v20.8H, v6.8H, v24.8H // ........................*................................. + // add v28.8H, v6.8H, v24.8H // .......................*.................................. 
+ // sub v23.8H, v0.8H, v31.8H // .........................*................................ + // add v18.8H, v0.8H, v31.8H // ..........................*............................... + // trn1 v19.4S, v28.4S, v20.4S // ............................*............................. + // trn2 v26.4S, v28.4S, v20.4S // ...........................*.............................. + // trn2 v9.4S, v18.4S, v23.4S // .............................*............................ + // ldr q12, [x4, #-48] // ........................................*................. + // trn2 v15.2D, v26.2D, v9.2D // .................................*........................ + // trn1 v0.4S, v18.4S, v23.4S // ..............................*........................... + // mul v24.8H, v15.8H, v22.8H // ...................................*...................... + // sqrdmulh v10.8H, v15.8H, v16.8H // ....................................*..................... + // trn2 v28.2D, v19.2D, v0.2D // ................................*......................... + // trn1 v21.2D, v26.2D, v9.2D // ......................................*................... + // mul v26.8H, v28.8H, v22.8H // .........................................*................ + // mls v24.8H, v10.8H, v7.H[0] // .......................................*.................. + // trn1 v4.2D, v19.2D, v0.2D // ...............................................*.......... + // ldr q13, [x4, #-64] // .....................................*.................... + // add v29.8H, v21.8H, v24.8H // ..........................................*............... + // sqrdmulh v31.8H, v28.8H, v16.8H // ..................................*....................... + // sub v14.8H, v21.8H, v24.8H // ................................................*......... + // mul v15.8H, v29.8H, v13.8H // ............................................*............. + // sqrdmulh v12.8H, v29.8H, v12.8H // .............................................*............ 
+ // mls v26.8H, v31.8H, v7.H[0] // ..............................................*........... + // ldr q31, [x4, #-32] // ...........................................*.............. + // mls v15.8H, v12.8H, v7.H[0] // .................................................*........ + // add v29.8H, v4.8H, v26.8H // ..................................................*....... + // ldr q8, [x4, #-16] // ...............................*.......................... + // sub v28.8H, v29.8H, v15.8H // ......................................................*... + // add v27.8H, v29.8H, v15.8H // .....................................................*.... + // mul v18.8H, v14.8H, v31.8H // ....................................................*..... + // sqdmulh v0.8H, v28.8H, v7.H[1] // .........................................................* + // sqrdmulh v25.8H, v14.8H, v8.8H // ...................................................*...... + // sub v8.8H, v4.8H, v26.8H // .......................................................*.. + // sqdmulh v24.8H, v27.8H, v7.H[1] // ........................................................*. + + sub count, count, #1 +layer4567_start: + srshr v6.8H, v0.8H, #11 // ...............................................................*........ + // gap // ........................................................................ + mls v18.8H, v25.8H, v7.H[0] // ........................................................*............... + // gap // ........................................................................ + ldr q0, [x1, #112] // ...e.................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v28.8H, v6.8H, v7.H[0] // ................................................................*....... + // gap // ........................................................................ + sub v30.8H, v8.8H, v18.8H // .........................................................*.............. + // gap // ........................................................................ + add v29.8H, v8.8H, v18.8H // ..........................................................*............. + // gap // ........................................................................ + srshr v19.8H, v24.8H, #11 // ............................................................*........... + // gap // ........................................................................ + sqdmulh v5.8H, v30.8H, v7.H[1] // ....................................................................*... + // gap // ........................................................................ + sqdmulh v24.8H, v29.8H, v7.H[1] // .................................................................*...... + // gap // ........................................................................ + ldr q23, [x1, #96] // ..e..................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v12.8H, v5.8H, #11 // .....................................................................*.. + // gap // ........................................................................ + srshr v11.8H, v24.8H, #11 // ..................................................................*..... + // gap // ........................................................................ + ldr q10, [x3], #16 // ....e................................................................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v30.8H, v12.8H, v7.H[0] // ......................................................................*. + // gap // ........................................................................ + mls v29.8H, v11.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + mls v27.8H, v19.8H, v7.H[0] // .............................................................*.......... + // gap // ........................................................................ + sqrdmulh v2.8H, v0.8H, v10.H[1] // ...........e............................................................ + // gap // ........................................................................ + ldr q22, [x4], #(6*16) // .................................e...................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x1], #64 // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v0.8H, v0.8H, v10.H[0] // ..........e............................................................. + // gap // ........................................................................ + mul v26.8H, v23.8H, v10.H[0] // .....e.................................................................. + // gap // ........................................................................ + ldr q6, [x1, #16] // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v0.8H, v2.8H, v7.H[0] // ............e........................................................... + // gap // ........................................................................ + sqrdmulh v24.8H, v23.8H, v10.H[1] // ......e................................................................. + // gap // ........................................................................ + ldr q18, [x1, #0] // e....................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v14.8H, v6.8H, v0.8H // ..............e......................................................... + // gap // ........................................................................ 
+ mls v26.8H, v24.8H, v7.H[0] // .......e................................................................ + // gap // ........................................................................ + sub v17.8H, v6.8H, v0.8H // .............e.......................................................... + // gap // ........................................................................ + sqrdmulh v21.8H, v14.8H, v10.H[3] // ................e....................................................... + // gap // ........................................................................ + mul v24.8H, v14.8H, v10.H[2] // ...............e........................................................ + // gap // ........................................................................ + sqrdmulh v8.8H, v17.8H, v10.H[5] // .....................e.................................................. + // gap // ........................................................................ + mul v31.8H, v17.8H, v10.H[4] // ....................e................................................... + // gap // ........................................................................ + ldr q16, [x4, #-80] // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v24.8H, v21.8H, v7.H[0] // .................e...................................................... + // gap // ........................................................................ + add v6.8H, v18.8H, v26.8H // .........e.............................................................. + // gap // ........................................................................ + mls v31.8H, v8.8H, v7.H[0] // ......................e................................................. 
+ // gap // ........................................................................ + sub v0.8H, v18.8H, v26.8H // ........e............................................................... + // gap // ........................................................................ + sub v20.8H, v6.8H, v24.8H // ..................e..................................................... + // gap // ........................................................................ + add v28.8H, v6.8H, v24.8H // ...................e.................................................... + // gap // ........................................................................ + sub v23.8H, v0.8H, v31.8H // .......................e................................................ + // gap // ........................................................................ + add v18.8H, v0.8H, v31.8H // ........................e............................................... + // gap // ........................................................................ + trn1 v19.4S, v28.4S, v20.4S // .........................e.............................................. + // gap // ........................................................................ + trn2 v26.4S, v28.4S, v20.4S // ..........................e............................................. + // gap // ........................................................................ + trn2 v9.4S, v18.4S, v23.4S // ............................e........................................... + // gap // ........................................................................ + ldr q12, [x4, #-48] // ....................................e................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ trn2 v15.2D, v26.2D, v9.2D // ..............................e......................................... + // gap // ........................................................................ + trn1 v0.4S, v18.4S, v23.4S // ...........................e............................................ + // gap // ........................................................................ + mul v24.8H, v15.8H, v22.8H // ............................................e........................... + // gap // ........................................................................ + sqrdmulh v10.8H, v15.8H, v16.8H // .............................................e.......................... + // gap // ........................................................................ + trn2 v28.2D, v19.2D, v0.2D // .............................e.......................................... + // gap // ........................................................................ + trn1 v21.2D, v26.2D, v9.2D // ................................e....................................... + // gap // ........................................................................ + mul v26.8H, v28.8H, v22.8H // .......................................e................................ + // gap // ........................................................................ + mls v24.8H, v10.8H, v7.H[0] // ..............................................e......................... + // gap // ........................................................................ + trn1 v4.2D, v19.2D, v0.2D // ...............................e........................................ + // gap // ........................................................................ + ldr q13, [x4, #-64] // ...................................e.................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + add v29.8H, v21.8H, v24.8H // ................................................e....................... + // gap // ........................................................................ + sqrdmulh v31.8H, v28.8H, v16.8H // ........................................e............................... + // gap // ........................................................................ + sub v14.8H, v21.8H, v24.8H // ...............................................e........................ + // gap // ........................................................................ + mul v15.8H, v29.8H, v13.8H // .................................................e...................... + // gap // ........................................................................ + sqrdmulh v12.8H, v29.8H, v12.8H // ..................................................e..................... + // gap // ........................................................................ + mls v26.8H, v31.8H, v7.H[0] // .........................................e.............................. + // gap // ........................................................................ + ldr q31, [x4, #-32] // .....................................e.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v15.8H, v12.8H, v7.H[0] // ...................................................e.................... + // gap // ........................................................................ + add v29.8H, v4.8H, v26.8H // ...........................................e............................ + // gap // ........................................................................ 
+ ldr q8, [x4, #-16] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v28.8H, v29.8H, v15.8H // ....................................................e................... + // gap // ........................................................................ + add v27.8H, v29.8H, v15.8H // .....................................................e.................. + // gap // ........................................................................ + mul v18.8H, v14.8H, v31.8H // ......................................................e................. + // gap // ........................................................................ + sqdmulh v0.8H, v28.8H, v7.H[1] // ..............................................................e......... + // gap // ........................................................................ + sqrdmulh v25.8H, v14.8H, v8.8H // .......................................................e................ + // gap // ........................................................................ + sub v8.8H, v4.8H, v26.8H // ..........................................e............................. + // gap // ........................................................................ + sqdmulh v24.8H, v27.8H, v7.H[1] // ...........................................................e............ + // gap // ........................................................................ + + // original source code + // ldr q8, [x1, #(16*0)] // ......................e...............................................|.................. + // ldr q9, [x1, #(16*1)] // ...................e..................................................|.................. 
+ // ldr q10, [x1, #(16*2)] // .......e..............................................................|........e......... + // ldr q11, [x1, #(16*3)] // e.....................................................................|.e................ + // ldr q0, [x3], #16 // ..........e...........................................................|...........e...... + // mul v24.8h, v10.8h, v0.h[0] // ..................e...................................................|.................. + // sqrdmulh v10.8h, v10.8h, v0.h[1] // .....................e................................................|.................. + // mls v24.8h, v10.8h, v7.h[0] // ........................e.............................................|.................. + // sub v10.8h, v8.8h, v24.8h // ..................................e...................................|.................. + // add v8.8h, v8.8h, v24.8h // ................................e.....................................|.................. + // mul v24.8h, v11.8h, v0.h[0] // .................e....................................................|.................. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // ..............e.......................................................|...............e.. + // mls v24.8h, v11.8h, v7.h[0] // ....................e.................................................|.................. + // sub v11.8h, v9.8h, v24.8h // .........................e............................................|.................. + // add v9.8h, v9.8h, v24.8h // .......................e..............................................|.................. + // mul v24.8h, v9.8h, v0.h[2] // ...........................e..........................................|.................. + // sqrdmulh v9.8h, v9.8h, v0.h[3] // ..........................e...........................................|.................. + // mls v24.8h, v9.8h, v7.h[0] // ...............................e......................................|.................. 
+ // sub v9.8h, v8.8h, v24.8h // ...................................e..................................|.................. + // add v8.8h, v8.8h, v24.8h // ....................................e.................................|.................. + // mul v24.8h, v11.8h, v0.h[4] // .............................e........................................|.................. + // sqrdmulh v11.8h, v11.8h, v0.h[5] // ............................e.........................................|.................. + // mls v24.8h, v11.8h, v7.h[0] // .................................e....................................|.................. + // sub v11.8h, v10.8h, v24.8h // .....................................e................................|.................. + // add v10.8h, v10.8h, v24.8h // ......................................e...............................|.................. + // trn1 v25.4s, v8.4s, v9.4s // .......................................e..............................|.................. + // trn2 v26.4s, v8.4s, v9.4s // ........................................e.............................|.................. + // trn1 v27.4s, v10.4s, v11.4s // ............................................e.........................|.................. + // trn2 v28.4s, v10.4s, v11.4s // .........................................e............................|.................. + // trn2 v10.2d, v25.2d, v27.2d // ...............................................e......................|.................. + // trn2 v11.2d, v26.2d, v28.2d // ...........................................e..........................|.................. + // trn1 v8.2d, v25.2d, v27.2d // ...................................................e..................|.................. + // trn1 v9.2d, v26.2d, v28.2d // ................................................e.....................|.................. 
+ // ldr q0, [x4], #(6*16) // ...............e......................................................|................e. + // ldr q4, [x4, #(-6*16 + 1*16)] // ..............................e.......................................|.................. + // ldr q1, [x4, #(-6*16 + 2*16)] // ....................................................e.................|.................. + // ldr q5, [x4, #(-6*16 + 3*16)] // ..........................................e...........................|.................. + // ldr q2, [x4, #(-6*16 + 4*16)] // ...........................................................e..........|.................. + // ldr q6, [x4, #(-6*16 + 5*16)] // ..............................................................e.......|.................. + // mul v24.8h, v10.8h, v0.8h // .................................................e....................|.................. + // sqrdmulh v10.8h, v10.8h, v4.8h // ......................................................e...............|.................. + // mls v24.8h, v10.8h, v7.h[0] // ..........................................................e...........|.................. + // sub v10.8h, v8.8h, v24.8h // ....................................................................e.|.................. + // add v8.8h, v8.8h, v24.8h // .............................................................e........|.................. + // mul v24.8h, v11.8h, v0.8h // .............................................e........................|.................. + // sqrdmulh v11.8h, v11.8h, v4.8h // ..............................................e.......................|.................. + // mls v24.8h, v11.8h, v7.h[0] // ..................................................e...................|.................. + // sub v11.8h, v9.8h, v24.8h // .......................................................e..............|.................. 
+ // add v9.8h, v9.8h, v24.8h // .....................................................e................|.................. + // mul v24.8h, v9.8h, v1.8h // ........................................................e.............|.................. + // sqrdmulh v9.8h, v9.8h, v5.8h // .........................................................e............|.................. + // mls v24.8h, v9.8h, v7.h[0] // ............................................................e.........|.................. + // sub v9.8h, v8.8h, v24.8h // ...............................................................e......|.................. + // add v8.8h, v8.8h, v24.8h // ................................................................e.....|.................. + // mul v24.8h, v11.8h, v2.8h // .................................................................e....|.................. + // sqrdmulh v11.8h, v11.8h, v6.8h // ...................................................................e..|.................. + // mls v24.8h, v11.8h, v7.h[0] // ......................................................................|*................. + // sub v11.8h, v10.8h, v24.8h // ..*...................................................................|...*.............. + // add v10.8h, v10.8h, v24.8h // ...*..................................................................|....*............. + // sqdmulh v25.8h, v8.8h, v7.h[1] // .....................................................................e|.................. + // srshr v25.8h, v25.8h, #11 // ....*.................................................................|.....*............ + // mls v8.8h, v25.8h, v7.h[0] // .............*........................................................|..............*... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ..................................................................e...|.................. 
+ // srshr v25.8h, v25.8h, #11 // ......................................................................*.................. + // mls v9.8h, v25.8h, v7.h[0] // .*....................................................................|..*............... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ......*...............................................................|.......*.......... + // srshr v25.8h, v25.8h, #11 // .........*............................................................|..........*....... + // mls v10.8h, v25.8h, v7.h[0] // ............*.........................................................|.............*.... + // sqdmulh v25.8h, v11.8h, v7.h[1] // .....*................................................................|......*........... + // srshr v25.8h, v25.8h, #11 // ........*.............................................................|.........*........ + // mls v11.8h, v25.8h, v7.h[0] // ...........*..........................................................|............*..... + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ................*.....................................................|.................* + + sub count, count, #1 + cbnz count, layer4567_start + mls v18.8H, v25.8H, v7.H[0] // .*............ + // gap // .............. + srshr v21.8H, v0.8H, #11 // *............. + // gap // .............. + srshr v0.8H, v24.8H, #11 // .....*........ + // gap // .............. + // gap // .............. + // gap // .............. + sub v30.8H, v8.8H, v18.8H // ...*.......... + // gap // .............. + add v29.8H, v8.8H, v18.8H // ....*......... + // gap // .............. + mls v27.8H, v0.8H, v7.H[0] // ............*. + // gap // .............. + sqdmulh v0.8H, v30.8H, v7.H[1] // ......*....... + // gap // .............. + sqdmulh v26.8H, v29.8H, v7.H[1] // .......*...... + // gap // .............. + mls v28.8H, v21.8H, v7.H[0] // ..*........... + // gap // .............. + // gap // .............. + // gap // .............. 
+ srshr v0.8H, v0.8H, #11 // ........*..... + // gap // .............. + srshr v25.8H, v26.8H, #11 // .........*.... + // gap // .............. + // gap // .............. + // gap // .............. + mls v30.8H, v0.8H, v7.H[0] // ..........*... + // gap // .............. + mls v29.8H, v25.8H, v7.H[0] // ...........*.. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x1], #64 // .............* + // gap // .............. + + // original source code + // srshr v6.8H, v0.8H, #11 // .*............ + // mls v18.8H, v25.8H, v7.H[0] // *............. + // mls v28.8H, v6.8H, v7.H[0] // ........*..... + // sub v30.8H, v8.8H, v18.8H // ...*.......... + // add v29.8H, v8.8H, v18.8H // ....*......... + // srshr v19.8H, v24.8H, #11 // ..*........... + // sqdmulh v5.8H, v30.8H, v7.H[1] // ......*....... + // sqdmulh v24.8H, v29.8H, v7.H[1] // .......*...... + // srshr v12.8H, v5.8H, #11 // .........*.... + // srshr v11.8H, v24.8H, #11 // ..........*... + // mls v30.8H, v12.8H, v7.H[0] // ...........*.. + // mls v29.8H, v11.8H, v7.H[0] // ............*. + // mls v27.8H, v19.8H, v7.H[0] // .....*........ 
+        // st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x1], #64 // .............*
+
+
+        pop_stack
+        ret
\ No newline at end of file
diff --git a/tutorial/tutorial-3a.py b/tutorial/tutorial-3a.py
new file mode 100644
index 00000000..70ccf2d1
--- /dev/null
+++ b/tutorial/tutorial-3a.py
@@ -0,0 +1,23 @@
+import logging
+import sys
+
+sys.path.append("../")  # relative to the working directory: run this script from tutorial/
+from slothy import Slothy
+
+import slothy.targets.aarch64.aarch64_neon as AArch64_Neon
+import slothy.targets.aarch64.cortex_a55 as Target_CortexA55
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # log INFO and above to stdout
+
+arch = AArch64_Neon
+target = Target_CortexA55
+
+slothy = Slothy(arch, target)
+
+# Tutorial 3a (straight-line optimization): load aarch64_simple0.s, configure, optimize, write the result
+slothy.load_source_from_file("../examples/naive/aarch64/aarch64_simple0.s")
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+
+slothy.optimize()  # no start/end given -- presumably the entire loaded source is optimized
+slothy.write_source_to_file("opt/aarch64_simple0_opt_a55.s")  # relative path; presumably opt/ must already exist -- TODO confirm
diff --git a/tutorial/tutorial-3b.py b/tutorial/tutorial-3b.py
new file mode 100644
index 00000000..be04c41f
--- /dev/null
+++ b/tutorial/tutorial-3b.py
@@ -0,0 +1,23 @@
+import logging
+import sys
+
+sys.path.append("../")  # relative to the working directory: run this script from tutorial/
+from slothy import Slothy
+
+import slothy.targets.aarch64.aarch64_neon as AArch64_Neon
+import slothy.targets.aarch64.cortex_a55 as Target_CortexA55
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # log INFO and above to stdout
+
+arch = AArch64_Neon
+target = Target_CortexA55
+
+slothy = Slothy(arch, target)
+
+# Tutorial 3b: same flow as tutorial 3a, but on the macros variant and restricted to a start/end region
+slothy.load_source_from_file("../examples/naive/aarch64/aarch64_simple0_macros.s")
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+
+slothy.optimize(start="start", end="end")  # "start"/"end" are presumably labels in the .s file -- verify
+slothy.write_source_to_file("opt/aarch64_simple0_macros_opt_a55.s")
diff --git a/tutorial/tutorial-4.py b/tutorial/tutorial-4.py
new file mode 100644
index 00000000..cc239560
--- /dev/null
+++ b/tutorial/tutorial-4.py
@@ -0,0 +1,26 @@
+import logging
+import sys
+
+sys.path.append("../")  # relative to the working directory: run this script from tutorial/
+from slothy import Slothy
+
+import slothy.targets.aarch64.aarch64_neon as AArch64_Neon
+import slothy.targets.aarch64.cortex_a55 as Target_CortexA55
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # log INFO and above to stdout
+
+arch = AArch64_Neon
+target = Target_CortexA55
+
+slothy = Slothy(arch, target)
+
+# Tutorial 4 (software pipelining): optimize the loop in aarch64_simple0_loop.s
+slothy.load_source_from_file("../examples/naive/aarch64/aarch64_simple0_loop.s")
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+
+slothy.config.sw_pipelining.enabled = True
+slothy.config.sw_pipelining.optimize_preamble = False  # presumably: leave the code before the loop body unoptimized
+slothy.config.sw_pipelining.optimize_postamble = False  # presumably: leave the code after the loop body unoptimized
+slothy.optimize_loop("start")  # "start" presumably names the loop label -- verify against the .s file
+slothy.write_source_to_file("opt/aarch64_simple0_loop_opt_a55.s")
diff --git a/tutorial/tutorial-5.py b/tutorial/tutorial-5.py
new file mode 100644
index 00000000..1a87d5c7
--- /dev/null
+++ b/tutorial/tutorial-5.py
@@ -0,0 +1,27 @@
+import logging
+import sys
+
+sys.path.append("../")  # relative to the working directory: run this script from tutorial/
+from slothy import Slothy
+
+import slothy.targets.aarch64.aarch64_neon as AArch64_Neon
+import slothy.targets.aarch64.cortex_a55 as Target_CortexA55
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # log INFO and above to stdout
+
+arch = AArch64_Neon
+target = Target_CortexA55
+
+slothy = Slothy(arch, target)
+
+# Tutorial 5: same loop and settings as tutorial 4, with LLVM-MCA switched on
+slothy.load_source_from_file("../examples/naive/aarch64/aarch64_simple0_loop.s")
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+
+slothy.config.sw_pipelining.enabled = True
+slothy.config.sw_pipelining.optimize_preamble = False
+slothy.config.sw_pipelining.optimize_postamble = False
+slothy.config.with_llvm_mca = True  # tutorial_all.sh only runs this script when llvm-mca is found on PATH
+slothy.optimize_loop("start")
+slothy.write_source_to_file("opt/aarch64_simple0_loop_opt_mca_a55.s")
diff --git a/tutorial/tutorial-6.py b/tutorial/tutorial-6.py
new file mode 100644
index 00000000..a9c3de6a
--- /dev/null
+++ b/tutorial/tutorial-6.py
@@ -0,0 +1,27 @@
+import logging
+import sys
+
+sys.path.append("../")  # relative to the working directory: run this script from tutorial/
+from slothy import Slothy
+
+import slothy.targets.aarch64.aarch64_neon as AArch64_Neon
+import slothy.targets.aarch64.cortex_a55 as Target_CortexA55
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # log INFO and above to stdout
+
+arch = AArch64_Neon
+target = Target_CortexA55
+
+slothy = Slothy(arch, target)
+
+# Tutorial 6 (full Neon NTT): software-pipeline both loops of ntt_kyber_123_4567.s
+slothy.load_source_from_file("../examples/naive/aarch64/ntt_kyber_123_4567.s")
+slothy.config.sw_pipelining.enabled = True
+slothy.config.inputs_are_outputs = True
+slothy.config.sw_pipelining.minimize_overlapping = False
+slothy.config.variable_size = True
+slothy.config.reserved_regs = [f"x{i}" for i in range(0, 7)] + ["x30", "sp"]  # x0..x6, x30 and sp
+slothy.config.constraints.stalls_first_attempt = 64
+slothy.optimize_loop("layer123_start")  # first loop, using the configuration set above
+slothy.optimize_loop("layer4567_start")  # second loop, same configuration
+slothy.write_source_to_file("opt/ntt_kyber_123_4567_opt_a55.s")
diff --git a/tutorial/tutorial-7.py b/tutorial/tutorial-7.py
new file mode 100644
index 00000000..1cae15ee
--- /dev/null
+++ b/tutorial/tutorial-7.py
@@ -0,0 +1,37 @@
+import logging
+import sys
+
+sys.path.append("../")  # relative to the working directory: run this script from tutorial/
+from slothy import Slothy
+
+import slothy.targets.aarch64.aarch64_neon as AArch64_Neon
+import slothy.targets.aarch64.cortex_a55 as Target_CortexA55
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # log INFO and above to stdout
+
+arch = AArch64_Neon
+target = Target_CortexA55
+
+slothy = Slothy(arch, target)
+
+# Tutorial 7 (larger code): optimize the mainloop..end_label region of X25519-AArch64-simple.s in two passes
+slothy.load_source_from_file("../examples/naive/aarch64/X25519-AArch64-simple.s")
+
+# first pass: replace symbolic register names by architectural registers
+slothy.config.inputs_are_outputs=True
+slothy.config.outputs=["x0"]
+slothy.config.constraints.functional_only = True
+slothy.config.constraints.allow_reordering = False  # keep the original instruction order in this pass
+slothy.optimize(start="mainloop", end="end_label")
+slothy.config.constraints.functional_only = False  # undo the first-pass restrictions before the second pass
+slothy.config.constraints.allow_reordering = True
+
+# second pass: splitting heuristic
+slothy.config.variable_size=True
+slothy.config.constraints.stalls_first_attempt=32
+slothy.config.split_heuristic = True
+slothy.config.split_heuristic_stepsize = 0.05
+slothy.config.split_heuristic_factor = 10
+slothy.config.split_heuristic_repeat = 2
+slothy.optimize(start="mainloop", end="end_label")  # same region as the first pass, now with reordering allowed
+slothy.write_source_to_file("opt/X25519-AArch64-simple_opt.s")
diff --git a/tutorial/tutorial_all.sh b/tutorial/tutorial_all.sh
new file mode 100755
index 00000000..5089c441
--- /dev/null
+++ b/tutorial/tutorial_all.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env sh
+set -e  # abort at the first tutorial that exits with a non-zero status
+
+echo "* tutorial-3a.py (Straightline optimization)"
+python3 tutorial-3a.py >/dev/null  # the scripts log to stdout, which is discarded; stderr stays visible
+
+echo "* tutorial-3b.py (Clean code)"
+python3 tutorial-3b.py >/dev/null
+
+echo "* tutorial-4.py (Software pipelining)"
+python3 tutorial-4.py >/dev/null
+
+if [ -x "$(command -v llvm-mca)" ]  # run tutorial 5 only when an executable llvm-mca is on PATH
+then
+    echo "* tutorial-5.py (Running SLOTHY with LLVM-MCA)"
+    python3 tutorial-5.py >/dev/null
+else
+    echo "* tutorial-5.py (Running SLOTHY with LLVM-MCA) SKIP"
+fi
+
+echo "* tutorial-6.py (Optimizing a full Neon NTT)"
+python3 tutorial-6.py >/dev/null
+
+echo "* tutorial-7.py (Optimizing larger pieces of code)"
+python3 tutorial-7.py >/dev/null
+
+echo "Done :-)"