diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678.s b/examples/naive/aarch64/intt_dilithium_1234_5678.s index 8228b2c1..cdb7d402 100644 --- a/examples/naive/aarch64/intt_dilithium_1234_5678.s +++ b/examples/naive/aarch64/intt_dilithium_1234_5678.s @@ -85,15 +85,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm .macro montg_reduce a @@ -114,12 +114,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -486,25 +480,25 @@ layer1234_start: str_vo data14, in, (14*(512/8)) str_vo data15, in, (15*(512/8)) - mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7 - - canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - - str_vi data8, in, (16) - str_vo data9, in, (-16 + 1*(512/8)) - str_vo data10, in, (-16 + 2*(512/8)) - str_vo data11, in, (-16 + 3*(512/8)) - str_vo data12, in, (-16 + 4*(512/8)) - str_vo data13, in, (-16 + 5*(512/8)) - str_vo data14, in, (-16 + 6*(512/8)) - str_vo data15, in, (-16 + 7*(512/8)) + mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 + + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + str_vo data4, in, (-16 + 4*(512/8)) + str_vo data5, in, (-16 + 5*(512/8)) + str_vo data6, in, (-16 + 6*(512/8)) + str_vo data7, in, (-16 + 7*(512/8)) // layer1234_end: subs count, count, #1 diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s b/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s index 153895ca..3e136547 100644 --- a/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s +++ b/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s @@ -85,15 +85,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm .macro montg_reduce a @@ -114,12 +114,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -484,25 +478,25 @@ layer1234_start: str_vo data14, in, (14*(512/8)) str_vo data15, in, (15*(512/8)) - mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7 - - canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - - str_vi data8, in, (16) - str_vo data9, in, (-16 + 1*(512/8)) - str_vo data10, in, (-16 + 2*(512/8)) - str_vo data11, in, (-16 + 3*(512/8)) - str_vo data12, in, (-16 + 4*(512/8)) - str_vo data13, in, (-16 + 5*(512/8)) - str_vo data14, in, (-16 + 6*(512/8)) - str_vo data15, in, (-16 + 7*(512/8)) + mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 + + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + str_vo data4, in, (-16 + 4*(512/8)) + str_vo data5, in, (-16 + 5*(512/8)) + str_vo data6, in, (-16 + 6*(512/8)) + str_vo data7, in, (-16 + 7*(512/8)) // layer1234_end: subs count, count, #1 diff --git a/examples/naive/aarch64/intt_dilithium_123_45678.s b/examples/naive/aarch64/intt_dilithium_123_45678.s index efd53369..4527536e 100644 --- a/examples/naive/aarch64/intt_dilithium_123_45678.s +++ b/examples/naive/aarch64/intt_dilithium_123_45678.s @@ -47,15 +47,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro montg_reduce a @@ -514,17 +514,17 @@ layer123_start: str_vo data6, in, (6*(1024/8)) str_vo data7, in, (7*(1024/8)) - mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 - str_vi data4, in, (16) - str_vo data5, in, (-16 + 1*(1024/8)) - str_vo data6, in, (-16 + 2*(1024/8)) - str_vo data7, in, (-16 + 3*(1024/8)) + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(1024/8)) + str_vo data2, in, (-16 + 2*(1024/8)) + str_vo data3, in, (-16 + 3*(1024/8)) subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s b/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s index 464e0473..2a62f678 100644 --- a/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s +++ b/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s @@ -47,15 +47,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro montg_reduce a @@ -523,17 +523,17 @@ layer123_start: str_vo data6, in, (6*(1024/8)) str_vo data7, in, (7*(1024/8)) - mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 - str_vi data4, in, (16) - str_vo data5, in, (-16 + 1*(1024/8)) - str_vo data6, in, (-16 + 2*(1024/8)) - str_vo data7, in, (-16 + 3*(1024/8)) + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(1024/8)) + str_vo data2, in, (-16 + 2*(1024/8)) + str_vo data3, in, (-16 + 3*(1024/8)) subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/intt_kyber_123_4567.s b/examples/naive/aarch64/intt_kyber_123_4567.s index 034fe144..ee06171f 100644 --- a/examples/naive/aarch64/intt_kyber_123_4567.s +++ b/examples/naive/aarch64/intt_kyber_123_4567.s @@ -67,15 +67,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +84,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -443,12 +437,12 @@ layer123_start: str_vo data6, in, (6*(512/8)) str_vo data7, in, (7*(512/8)) - mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - str_vi data4, in, (16) - str_vo data5, in, (-16 + 1*(512/8)) - str_vo data6, in, (-16 + 2*(512/8)) - str_vo data7, in, (-16 + 3*(512/8)) + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) subs count, count, #1 diff --git a/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s b/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s index b69bbe11..cdd00ff1 100644 --- a/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s +++ b/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s @@ -67,15 +67,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +84,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -438,12 +432,12 @@ layer123_start: str_vo data6, in, (6*(512/8)) str_vo data7, in, (7*(512/8)) - mul_ninv data4, data5, data6, data7, data0, data1, data2, data3 + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - str_vi data4, in, (16) - str_vo data5, in, (-16 + 1*(512/8)) - str_vo data6, in, (-16 + 2*(512/8)) - str_vo data7, in, (-16 + 3*(512/8)) + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) subs count, count, #1