diff --git a/example.py b/example.py index 3e1181dd..9604d1eb 100644 --- a/example.py +++ b/example.py @@ -590,14 +590,31 @@ def core(self, slothy): slothy.config.sw_pipelining.enabled = True slothy.config.inputs_are_outputs = True slothy.config.sw_pipelining.minimize_overlapping = False + slothy.config.sw_pipelining.optimize_preamble = False + slothy.config.sw_pipelining.allow_post = True slothy.config.variable_size = True - slothy.config.reserved_regs = [ - f"x{i}" for i in range(0, 7)] + ["x30", "sp"] - slothy.config.reserved_regs += self.target_reserved - slothy.config.constraints.stalls_first_attempt = 64 + slothy.config.constraints.stalls_first_attempt = 32 + slothy.config.inputs_are_outputs = True slothy.optimize_loop("layer123_start") - slothy.optimize_loop("layer4567_start") + slothy.config.outputs = slothy.last_result.kernel_input_output + [f"x{i}" for i in range(0,6)] + slothy.config.locked_registers = [f"x{i}" for i in range(0,6)] + slothy.config.sw_pipelining.enabled = False + slothy.config.inputs_are_outputs = False + slothy.optimize(start="ntt_kyber_123_4567_preamble", end="layer123_start") + + slothy.config.outputs = [] + slothy.config.sw_pipelining.enabled = True + slothy.config.inputs_are_outputs = True + slothy.config.sw_pipelining.optimize_preamble = True + slothy.config.sw_pipelining.optimize_postamble = True + slothy.optimize_loop("layer4567_start", postamble_label="ntt_kyber_123_4567_postamble") + + slothy.config.outputs = [f"v{i}" for i in range(8,16)] + slothy.config.locked_registers = [f"x{i}" for i in range(0,6)] + slothy.config.sw_pipelining.enabled = False + slothy.config.inputs_are_outputs = False + slothy.optimize(start="ntt_kyber_123_4567_postamble", end="ntt_kyber_123_4567_end") class ntt_kyber_123(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): @@ -1286,6 +1303,7 @@ def main(): ntt_kyber_12_345_67(True, target=Target_CortexM85r1), # Cortex-A55 ntt_kyber_123_4567(), + 
ntt_kyber_123_4567(var="lazy_trn"), ntt_kyber_123_4567(var="scalar_load"), ntt_kyber_123_4567(var="scalar_store"), ntt_kyber_123_4567(var="scalar_load_store"), @@ -1293,6 +1311,7 @@ def main(): ntt_kyber_1234_567(), # Cortex-A72 ntt_kyber_123_4567(target=Target_CortexA72), + ntt_kyber_123_4567(var="lazy_trn", target=Target_CortexA72), ntt_kyber_123_4567(var="scalar_load", target=Target_CortexA72), ntt_kyber_123_4567(var="scalar_store", target=Target_CortexA72), ntt_kyber_123_4567(var="scalar_load_store", target=Target_CortexA72), @@ -1300,6 +1319,7 @@ def main(): ntt_kyber_1234_567(target=Target_CortexA72), # # Apple M1 Firestorm ntt_kyber_123_4567(target=Target_AppleM1_firestorm, timeout=3600), + ntt_kyber_123_4567(var="lazy_trn", target=Target_AppleM1_firestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_load", target=Target_AppleM1_firestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_store", target=Target_AppleM1_firestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_load_store", target=Target_AppleM1_firestorm, timeout=3600), @@ -1308,6 +1328,7 @@ def main(): ntt_kyber_1234_567(var="manual_st4", target=Target_AppleM1_firestorm, timeout=300), # Apple M1 Icestorm ntt_kyber_123_4567(target=Target_AppleM1_icestorm, timeout=3600), + ntt_kyber_123_4567(var="lazy_trn", target=Target_AppleM1_icestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_load", target=Target_AppleM1_icestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_store", target=Target_AppleM1_icestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_load_store", target=Target_AppleM1_icestorm, timeout=3600), diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_lazy_trn.s b/examples/naive/aarch64/ntt_kyber_123_4567_lazy_trn.s new file mode 100644 index 00000000..4ebb42a2 --- /dev/null +++ b/examples/naive/aarch64/ntt_kyber_123_4567_lazy_trn.s @@ -0,0 +1,385 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias 
Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+ +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67_0 + ldr_vi root0, r_ptr1, (8*16) + ldr_vo root1, r_ptr1, (-8*16 + 1*16) + ldr_vo root0_tw, r_ptr1, (-8*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-8*16 + 3*16) +.endm + +.macro load_next_roots_67_1 + ldr_vo root0, r_ptr1, (-8*16 + 4*16) + ldr_vo 
root0_tw, r_ptr1, (-8*16 + 5*16) + ldr_vo root1, r_ptr1, (-8*16 + 6*16) + ldr_vo root1_tw, r_ptr1, (-8*16 + 7*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose4_0 data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro transpose4_1 data_out, data_in + trn2 \data_out\()2.2d, \data_in\()0.2d, \data_in\()2.2d + trn2 \data_out\()3.2d, \data_in\()1.2d, \data_in\()3.2d + trn1 \data_out\()0.2d, \data_in\()0.2d, \data_in\()2.2d + trn1 \data_out\()1.2d, \data_in\()1.2d, \data_in\()3.2d +.endm + +.macro save_vregs + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] +.endm + +.macro alloc_stack + sub sp, sp, #(16*4) +.endm + +.macro free_stack + add sp, sp, #(16*4) +.endm + + in .req x0 + in_orig .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + modulus .req w5 + barrett .req w6 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req 
q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + +.data +.p2align 4 +roots: +#include "ntt_kyber_123_45_67_twiddles_twisted.s" +.text + + .global ntt_kyber_123_4567 + .global _ntt_kyber_123_4567 + +.p2align 4 +ntt_kyber_123_4567: +_ntt_kyber_123_4567: + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l56) + ldr modulus, =3329 + ldr barrett, =20159 + mov consts.H[0], modulus + mov consts.H[1], barrett + + alloc_stack + +ntt_kyber_123_4567_preamble: + save_vregs + + mov in_orig, in + mov count, #4 + + load_roots_123 + + .p2align 2 +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(512/8)) + ldr_vo data2, in, (2*(512/8)) + ldr_vo data3, in, (3*(512/8)) + ldr_vo data4, in, (4*(512/8)) + ldr_vo data5, in, (5*(512/8)) + ldr_vo data6, in, (6*(512/8)) + ldr_vo data7, in, (7*(512/8)) + + ct_butterfly data0, data4, root0, 0, 1 + ct_butterfly data1, data5, root0, 0, 1 + ct_butterfly data2, data6, 
root0, 0, 1 + ct_butterfly data3, data7, root0, 0, 1 + + ct_butterfly data0, data2, root0, 2, 3 + ct_butterfly data1, data3, root0, 2, 3 + ct_butterfly data4, data6, root0, 4, 5 + ct_butterfly data5, data7, root0, 4, 5 + + ct_butterfly data0, data1, root0, 6, 7 + ct_butterfly data2, data3, root1, 0, 1 + ct_butterfly data4, data5, root1, 2, 3 + ct_butterfly data6, data7, root1, 4, 5 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + str_vo data4, in, (-16 + 4*(512/8)) + str_vo data5, in, (-16 + 5*(512/8)) + str_vo data6, in, (-16 + 6*(512/8)) + str_vo data7, in, (-16 + 7*(512/8)) + + subs count, count, #1 + cbnz count, layer123_start + + mov in, in_orig + mov count, #8 + + .p2align 2 +layer4567_start: + ldr_vo data0, in, (16*0) + ldr_vo data1, in, (16*1) + ldr_vo data2, in, (16*2) + ldr_vo data3, in, (16*3) + + load_next_roots_45 + + ct_butterfly data0, data2, root0, 0, 1 + ct_butterfly data1, data3, root0, 0, 1 + ct_butterfly data0, data1, root0, 2, 3 + ct_butterfly data2, data3, root0, 4, 5 + + transpose4_1 t, data + + load_next_roots_67_0 + ct_butterfly_v t0, t2, root0, root0_tw + ct_butterfly_v t1, t3, root1, root1_tw + + transpose4_0 data, t + load_next_roots_67_1 + + ct_butterfly_v data0, data1, root0, root0_tw + ct_butterfly_v data2, data3, root1, root1_tw + + barrett_reduce data0 + barrett_reduce data1 + barrett_reduce data2 + barrett_reduce data3 + st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 + + subs count, count, #1 + cbnz count, layer4567_start + + restore_vregs +ntt_kyber_123_4567_end: + free_stack + ret diff --git a/examples/naive/aarch64/ntt_kyber_123_45_67_twiddles_twisted.s b/examples/naive/aarch64/ntt_kyber_123_45_67_twiddles_twisted.s new file mode 100644 index 00000000..5876bc45 --- /dev/null +++ b/examples/naive/aarch64/ntt_kyber_123_45_67_twiddles_twisted.s @@ -0,0 +1,622 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 
Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +.p2align 2 +roots_l012: +.short -1600 +.short -15749 +.short -749 +.short -7373 +.short -40 +.short -394 +.short -687 +.short -6762 +.short 630 +.short 6201 +.short -1432 +.short -14095 +.short 848 +.short 8347 +.short 0 +.short 0 +roots_l34: +.short 1062 +.short 10453 +.short 296 +.short 2914 +.short -882 +.short -8682 +.short 0 +.short 0 +.short -1410 +.short -13879 +.short 1339 +.short 13180 +.short 1476 +.short 14529 +.short 0 +.short 0 +.short 193 +.short 1900 +.short -283 +.short -2786 +.short 56 +.short 551 +.short 0 +.short 0 +.short 797 +.short 7845 +.short -1089 +.short -10719 +.short 1333 +.short 13121 +.short 0 +.short 0 +.short -543 +.short -5345 +.short 1426 +.short 14036 +.short -1235 +.short -12156 +.short 0 +.short 0 +.short -69 +.short -679 +.short 535 +.short 5266 +.short -447 +.short -4400 +.short 0 +.short 0 +.short 569 +.short 5601 +.short -936 +.short -9213 +.short -450 +.short -4429 +.short 0 +.short 0 +.short -1583 +.short -15582 +.short -1355 +.short -13338 +.short 821 +.short 8081 +.short 0 +.short 0 +roots_l56: +.short 289 +.short 289 +.short 289 +.short 289 +.short -76 +.short -76 +.short -76 +.short -76 +.short 331 +.short 331 +.short 331 +.short 331 +.short -1573 +.short -1573 +.short -1573 +.short -1573 +.short 2845 +.short 2845 +.short 2845 +.short 2845 +.short -748 +.short -748 +.short -748 +.short -748 +.short 3258 +.short 3258 +.short 3258 +.short 3258 +.short -15483 +.short -15483 +.short -15483 +.short -15483 +.short 17 +.short 17 +.short 583 +.short 583 +.short 1637 +.short 1637 +.short -1041 +.short -1041 +.short 167 +.short 167 +.short 5739 +.short 5739 +.short 16113 +.short 16113 +.short -10247 +.short -10247 +.short -568 +.short -568 +.short -680 +.short -680 +.short 723 +.short 723 +.short 1100 +.short 1100 +.short -5591 +.short -5591 +.short -6693 +.short -6693 +.short 7117 +.short 7117 +.short 10828 +.short 10828 +.short 1197 +.short 1197 +.short 1197 +.short 1197 +.short -1052 +.short -1052 +.short -1052 
+.short -1052 +.short -1025 +.short -1025 +.short -1025 +.short -1025 +.short -1274 +.short -1274 +.short -1274 +.short -1274 +.short 11782 +.short 11782 +.short 11782 +.short 11782 +.short -10355 +.short -10355 +.short -10355 +.short -10355 +.short -10089 +.short -10089 +.short -10089 +.short -10089 +.short -12540 +.short -12540 +.short -12540 +.short -12540 +.short 1409 +.short 1409 +.short -48 +.short -48 +.short 756 +.short 756 +.short -314 +.short -314 +.short 13869 +.short 13869 +.short -472 +.short -472 +.short 7441 +.short 7441 +.short -3091 +.short -3091 +.short -667 +.short -667 +.short 233 +.short 233 +.short -1173 +.short -1173 +.short -279 +.short -279 +.short -6565 +.short -6565 +.short 2293 +.short 2293 +.short -11546 +.short -11546 +.short -2746 +.short -2746 +.short 650 +.short 650 +.short 650 +.short 650 +.short -816 +.short -816 +.short -816 +.short -816 +.short -1352 +.short -1352 +.short -1352 +.short -1352 +.short 632 +.short 632 +.short 632 +.short 632 +.short 6398 +.short 6398 +.short 6398 +.short 6398 +.short -8032 +.short -8032 +.short -8032 +.short -8032 +.short -13308 +.short -13308 +.short -13308 +.short -13308 +.short 6221 +.short 6221 +.short 6221 +.short 6221 +.short -1626 +.short -1626 +.short -540 +.short -540 +.short -1482 +.short -1482 +.short 1461 +.short 1461 +.short -16005 +.short -16005 +.short -5315 +.short -5315 +.short -14588 +.short -14588 +.short 14381 +.short 14381 +.short 1651 +.short 1651 +.short -1540 +.short -1540 +.short 952 +.short 952 +.short -642 +.short -642 +.short 16251 +.short 16251 +.short -15159 +.short -15159 +.short 9371 +.short 9371 +.short -6319 +.short -6319 +.short -464 +.short -464 +.short -464 +.short -464 +.short 1320 +.short 1320 +.short 1320 +.short 1320 +.short 33 +.short 33 +.short 33 +.short 33 +.short -1414 +.short -1414 +.short -1414 +.short -1414 +.short -4567 +.short -4567 +.short -4567 +.short -4567 +.short 12993 +.short 12993 +.short 12993 +.short 12993 +.short 325 +.short 325 +.short 
325 +.short 325 +.short -13918 +.short -13918 +.short -13918 +.short -13918 +.short 939 +.short 939 +.short -892 +.short -892 +.short 733 +.short 733 +.short 268 +.short 268 +.short 9243 +.short 9243 +.short -8780 +.short -8780 +.short 7215 +.short 7215 +.short 2638 +.short 2638 +.short -1021 +.short -1021 +.short -941 +.short -941 +.short -992 +.short -992 +.short 641 +.short 641 +.short -10050 +.short -10050 +.short -9262 +.short -9262 +.short -9764 +.short -9764 +.short 6309 +.short 6309 +.short -1010 +.short -1010 +.short -1010 +.short -1010 +.short 807 +.short 807 +.short 807 +.short 807 +.short 1435 +.short 1435 +.short 1435 +.short 1435 +.short 452 +.short 452 +.short 452 +.short 452 +.short -9942 +.short -9942 +.short -9942 +.short -9942 +.short 7943 +.short 7943 +.short 7943 +.short 7943 +.short 14125 +.short 14125 +.short 14125 +.short 14125 +.short 4449 +.short 4449 +.short 4449 +.short 4449 +.short 1584 +.short 1584 +.short -1292 +.short -1292 +.short 375 +.short 375 +.short -1239 +.short -1239 +.short 15592 +.short 15592 +.short -12717 +.short -12717 +.short 3691 +.short 3691 +.short -12196 +.short -12196 +.short -1031 +.short -1031 +.short -109 +.short -109 +.short -780 +.short -780 +.short 1645 +.short 1645 +.short -10148 +.short -10148 +.short -1073 +.short -1073 +.short -7678 +.short -7678 +.short 16192 +.short 16192 +.short 1438 +.short 1438 +.short 1438 +.short 1438 +.short 1534 +.short 1534 +.short 1534 +.short 1534 +.short -461 +.short -461 +.short -461 +.short -461 +.short -927 +.short -927 +.short -927 +.short -927 +.short 14155 +.short 14155 +.short 14155 +.short 14155 +.short 15099 +.short 15099 +.short 15099 +.short 15099 +.short -4538 +.short -4538 +.short -4538 +.short -4538 +.short -9125 +.short -9125 +.short -9125 +.short -9125 +.short 1063 +.short 1063 +.short -556 +.short -556 +.short -1230 +.short -1230 +.short -863 +.short -863 +.short 10463 +.short 10463 +.short -5473 +.short -5473 +.short -12107 +.short -12107 +.short -8495 
+.short -8495 +.short 319 +.short 319 +.short 757 +.short 757 +.short 561 +.short 561 +.short -735 +.short -735 +.short 3140 +.short 3140 +.short 7451 +.short 7451 +.short 5522 +.short 5522 +.short -7235 +.short -7235 +.short -682 +.short -682 +.short -682 +.short -682 +.short 1481 +.short 1481 +.short 1481 +.short 1481 +.short -712 +.short -712 +.short -712 +.short -712 +.short 648 +.short 648 +.short 648 +.short 648 +.short -6713 +.short -6713 +.short -6713 +.short -6713 +.short 14578 +.short 14578 +.short 14578 +.short 14578 +.short -7008 +.short -7008 +.short -7008 +.short -7008 +.short 6378 +.short 6378 +.short 6378 +.short 6378 +.short -525 +.short -525 +.short 403 +.short 403 +.short 1143 +.short 1143 +.short -554 +.short -554 +.short -5168 +.short -5168 +.short 3967 +.short 3967 +.short 11251 +.short 11251 +.short -5453 +.short -5453 +.short 1092 +.short 1092 +.short 1026 +.short 1026 +.short -1179 +.short -1179 +.short 886 +.short 886 +.short 10749 +.short 10749 +.short 10099 +.short 10099 +.short -11605 +.short -11605 +.short 8721 +.short 8721 +.short -855 +.short -855 +.short -855 +.short -855 +.short 1227 +.short 1227 +.short 1227 +.short 1227 +.short -219 +.short -219 +.short -219 +.short -219 +.short 910 +.short 910 +.short 910 +.short 910 +.short -8416 +.short -8416 +.short -8416 +.short -8416 +.short 12078 +.short 12078 +.short 12078 +.short 12078 +.short -2156 +.short -2156 +.short -2156 +.short -2156 +.short 8957 +.short 8957 +.short 8957 +.short 8957 +.short -1607 +.short -1607 +.short -1455 +.short -1455 +.short -1219 +.short -1219 +.short 885 +.short 885 +.short -15818 +.short -15818 +.short -14322 +.short -14322 +.short -11999 +.short -11999 +.short 8711 +.short 8711 +.short 1212 +.short 1212 +.short 1029 +.short 1029 +.short -394 +.short -394 +.short -1175 +.short -1175 +.short 11930 +.short 11930 +.short 10129 +.short 10129 +.short -3878 +.short -3878 +.short -11566 +.short -11566 diff --git a/slothy/targets/aarch64/aarch64_neon.py 
b/slothy/targets/aarch64/aarch64_neon.py index b8cf719f..2f0365c3 100644 --- a/slothy/targets/aarch64/aarch64_neon.py +++ b/slothy/targets/aarch64/aarch64_neon.py @@ -1934,6 +1934,11 @@ class mov(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name inputs = ["Wa"] outputs = ["Wd"] +class mov_x(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name + pattern = "mov <Xd>, <Xa>" + inputs = ["Xa"] + outputs = ["Xd"] + class AArch64Move(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name pass diff --git a/slothy/targets/aarch64/apple_m1_firestorm_experimental.py b/slothy/targets/aarch64/apple_m1_firestorm_experimental.py index 63e7656e..347f2915 100644 --- a/slothy/targets/aarch64/apple_m1_firestorm_experimental.py +++ b/slothy/targets/aarch64/apple_m1_firestorm_experimental.py @@ -149,7 +149,7 @@ def get_min_max_objective(slothy): vuzp1, vuzp2): ExecutionUnit.V(), # Arithmetic - (add, add_imm): ExecutionUnit.I(), + (mov_x, mov_imm, sub_imm, add, add_imm): ExecutionUnit.I(), (add_lsl, add_lsr, add2): list(map(list, combinations(ExecutionUnit.I(), 2))), (umull_wform, mul_wform): ExecutionUnit.M(), (umaddl_wform): ExecutionUnit.SCALAR_I5, @@ -211,7 +211,7 @@ def get_min_max_objective(slothy): vuzp1, vuzp2): 1, # Arithmetic - (add, add_imm): 1, + (mov_x, mov_imm, sub_imm, add, add_imm): 1, (add_lsl, add_lsr, add2): 1, (umull_wform, mul_wform): 1, (umaddl_wform): 1, @@ -264,7 +264,7 @@ def get_min_max_objective(slothy): vuzp1, vuzp2): 2, # Arithmetic - (add, add_imm): 1, + (mov_x, mov_imm, sub_imm, add, add_imm): 1, (add_lsl, add_lsr, add2): 2, (umull_wform, mul_wform): 3, (umaddl_wform): 3, @@ -293,7 +293,7 @@ def get_latency(src, out_idx, dst): if instclass_src == umaddl_wform and instclass_dst == umaddl_wform and \ src.args_out[0] == dst.args_in[2]: return (3, lambda t_src, t_dst: t_dst.program_start_var == t_src.program_start_var + 1) - + return latency diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py 
b/slothy/targets/aarch64/cortex_a72_frontend.py index 81e408ff..778093b3 100644 --- a/slothy/targets/aarch64/cortex_a72_frontend.py +++ b/slothy/targets/aarch64/cortex_a72_frontend.py @@ -125,10 +125,10 @@ def get_min_max_objective(slothy): ( Ldr_Q, Ldr_X ) : ExecutionUnit.LOAD(), - ( Str_Q, Str_X ) + ( d_stp_stack_with_inc, Str_Q, Str_X ) : ExecutionUnit.STORE(), - (add, add_imm, add_lsl, add_lsr) : ExecutionUnit.SCALAR(), + (mov_x, mov_imm, sub_imm, add, add_imm, add_lsl, add_lsr) : ExecutionUnit.SCALAR(), vsrshr : [ExecutionUnit.ASIMD1], @@ -153,10 +153,10 @@ def get_min_max_objective(slothy): Vins : 1, umov_d : 1, - (add, add_imm, add_lsl, add_lsr) : 1, + (mov_x, mov_imm, sub_imm, add, add_imm, add_lsl, add_lsr) : 1, ( Ldr_Q, - Str_Q, + d_stp_stack_with_inc, Str_Q, Ldr_X, Str_X ) : 1, @@ -179,14 +179,14 @@ def get_min_max_objective(slothy): trn1, trn2 ) : 3, # Approximation -- not necessary to get it exactly right, as mentioned above - ( Ldr_Q, Ldr_X, + ( d_stp_stack_with_inc, Ldr_Q, Ldr_X, Str_Q, Str_X ) : 4, # approx Vins : 6, # approx umov_d : 4, # approx - (add, add_imm, add_lsl, add_lsr) : 2, + (mov_x, mov_imm, sub_imm, add, add_imm, add_lsl, add_lsr) : 2, vsrshr : 3, # approx St4 : 8,