diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678.s b/examples/naive/aarch64/intt_dilithium_1234_5678.s
index 8228b2c1..cdb7d402 100644
--- a/examples/naive/aarch64/intt_dilithium_1234_5678.s
+++ b/examples/naive/aarch64/intt_dilithium_1234_5678.s
@@ -85,15 +85,15 @@
 .endm
 
 .macro mulmodq dst, src, const, idx0, idx1
+        vqrdmulhq   t2,  \src, \const, \idx1   // result kept in scratch t2 (clobbered) instead of overwriting src, and issued before dst is written so dst may alias src. dst and src must not be t2
         vmulq       \dst,  \src, \const, \idx0
-        vqrdmulhq   \src,  \src, \const, \idx1
-        vmls        \dst,  \src, modulus
+        vmls       \dst,  t2, modulus   // combines dst with the t2 term and modulus (vmls is a wrapper defined elsewhere - presumably NEON MLS, confirm)
 .endm
 
 .macro mulmod dst, src, const, const_twisted
-        vmul       \dst,  \src, \const
-        vqrdmulh   \src,  \src, \const_twisted
-        vmls       \dst,  \src, modulus
+        vqrdmulh   t2,  \src, \const_twisted   // result kept in scratch t2 (clobbered) and computed before dst is written, so src is preserved and dst may alias src. dst and src must not be t2
+        mul        \dst\().4s,  \src\().4s, \const\().4s   // plain NEON mul on 4x32-bit lanes (low 32 bits of each product), used here in place of the vmul wrapper
+        vmls       \dst,  t2, modulus   // combines dst with the t2 term and modulus (vmls is a wrapper defined elsewhere - presumably NEON MLS, confirm)
 .endm
 
 .macro montg_reduce a
@@ -114,12 +114,6 @@
         mulmodq  \b, tmp, \root, \idx0, \idx1
 .endm
 
-.macro mulmod_v dst, src, const, const_twisted
-        vmul        \dst,  \src, \const
-        vqrdmulh    \src,  \src, \const_twisted
-        vmls        \dst,  \src, modulus
-.endm
-
 .macro gs_butterfly_v a, b, root, root_twisted
         vsub    tmp,    \a, \b
         vadd    \a,    \a, \b
@@ -486,25 +480,25 @@ layer1234_start:
         str_vo data14, in, (14*(512/8))
         str_vo data15, in, (15*(512/8))
 
-        mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7
-
-        canonical_reduce data8,  modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data9,  modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3
-
-        str_vi data8, in, (16)
-        str_vo data9, in, (-16 + 1*(512/8))
-        str_vo data10, in, (-16 + 2*(512/8))
-        str_vo data11, in, (-16 + 3*(512/8))
-        str_vo data12, in, (-16 + 4*(512/8))
-        str_vo data13, in, (-16 + 5*(512/8))
-        str_vo data14, in, (-16 + 6*(512/8))
-        str_vo data15, in, (-16 + 7*(512/8))
+        mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7   // both operand groups are data0-data7, so the results replace the inputs in place (previously written to data8-data15)
+
+        canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3   // t2 and t3 are passed as the last two operands of every call below (presumably scratch - confirm against the macro)
+        canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3
+
+        str_vi data0, in, (16)   // NOTE(review): the -16 in the offsets below suggests str_vi advances in by 16 - confirm against the macro
+        str_vo data1, in, (-16 + 1*(512/8))   // data1-data7 go to consecutive 64-byte strides, each offset reduced by 16
+        str_vo data2, in, (-16 + 2*(512/8))
+        str_vo data3, in, (-16 + 3*(512/8))
+        str_vo data4, in, (-16 + 4*(512/8))
+        str_vo data5, in, (-16 + 5*(512/8))
+        str_vo data6, in, (-16 + 6*(512/8))
+        str_vo data7, in, (-16 + 7*(512/8))
 
 // layer1234_end:
         subs count, count, #1
diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s b/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s
index 153895ca..3e136547 100644
--- a/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s
+++ b/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s
@@ -85,15 +85,15 @@
 .endm
 
 .macro mulmodq dst, src, const, idx0, idx1
+        vqrdmulhq   t2,  \src, \const, \idx1   // result kept in scratch t2 (clobbered) instead of overwriting src, and issued before dst is written so dst may alias src. dst and src must not be t2
         vmulq       \dst,  \src, \const, \idx0
-        vqrdmulhq   \src,  \src, \const, \idx1
-        vmls        \dst,  \src, modulus
+        vmls       \dst,  t2, modulus   // combines dst with the t2 term and modulus (vmls is a wrapper defined elsewhere - presumably NEON MLS, confirm)
 .endm
 
 .macro mulmod dst, src, const, const_twisted
-        vmul       \dst,  \src, \const
-        vqrdmulh   \src,  \src, \const_twisted
-        vmls       \dst,  \src, modulus
+        vqrdmulh   t2,  \src, \const_twisted   // result kept in scratch t2 (clobbered) and computed before dst is written, so src is preserved and dst may alias src. dst and src must not be t2
+        mul        \dst\().4s,  \src\().4s, \const\().4s   // plain NEON mul on 4x32-bit lanes (low 32 bits of each product), used here in place of the vmul wrapper
+        vmls       \dst,  t2, modulus   // combines dst with the t2 term and modulus (vmls is a wrapper defined elsewhere - presumably NEON MLS, confirm)
 .endm
 
 .macro montg_reduce a
@@ -114,12 +114,6 @@
         mulmodq  \b, tmp, \root, \idx0, \idx1
 .endm
 
-.macro mulmod_v dst, src, const, const_twisted
-        vmul        \dst,  \src, \const
-        vqrdmulh    \src,  \src, \const_twisted
-        vmls        \dst,  \src, modulus
-.endm
-
 .macro gs_butterfly_v a, b, root, root_twisted
         vsub    tmp,    \a, \b
         vadd    \a,    \a, \b
@@ -484,25 +478,25 @@ layer1234_start:
         str_vo data14, in, (14*(512/8))
         str_vo data15, in, (15*(512/8))
 
-        mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7
-
-        canonical_reduce data8,  modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data9,  modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3
-
-        str_vi data8, in, (16)
-        str_vo data9, in, (-16 + 1*(512/8))
-        str_vo data10, in, (-16 + 2*(512/8))
-        str_vo data11, in, (-16 + 3*(512/8))
-        str_vo data12, in, (-16 + 4*(512/8))
-        str_vo data13, in, (-16 + 5*(512/8))
-        str_vo data14, in, (-16 + 6*(512/8))
-        str_vo data15, in, (-16 + 7*(512/8))
+        mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7   // both operand groups are data0-data7, so the results replace the inputs in place (previously written to data8-data15)
+
+        canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3   // t2 and t3 are passed as the last two operands of every call below (presumably scratch - confirm against the macro)
+        canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3
+
+        str_vi data0, in, (16)   // NOTE(review): the -16 in the offsets below suggests str_vi advances in by 16 - confirm against the macro
+        str_vo data1, in, (-16 + 1*(512/8))   // data1-data7 go to consecutive 64-byte strides, each offset reduced by 16
+        str_vo data2, in, (-16 + 2*(512/8))
+        str_vo data3, in, (-16 + 3*(512/8))
+        str_vo data4, in, (-16 + 4*(512/8))
+        str_vo data5, in, (-16 + 5*(512/8))
+        str_vo data6, in, (-16 + 6*(512/8))
+        str_vo data7, in, (-16 + 7*(512/8))
 
 // layer1234_end:
         subs count, count, #1
diff --git a/examples/naive/aarch64/intt_dilithium_123_45678.s b/examples/naive/aarch64/intt_dilithium_123_45678.s
index efd53369..4527536e 100644
--- a/examples/naive/aarch64/intt_dilithium_123_45678.s
+++ b/examples/naive/aarch64/intt_dilithium_123_45678.s
@@ -47,15 +47,15 @@ xtmp1 .req x11
 .endm
 
 .macro mulmodq dst, src, const, idx0, idx1
+        vqrdmulhq   t2,  \src, \const, \idx1   // result kept in scratch t2 (clobbered) instead of overwriting src, and issued before dst is written so dst may alias src. dst and src must not be t2
         vmulq       \dst,  \src, \const, \idx0
-        vqrdmulhq   \src,  \src, \const, \idx1
-        vmlsq        \dst,  \src, consts, 0
+        vmlsq       \dst,  t2, consts, 0   // combines dst with the t2 term and index 0 of consts (vmlsq is a wrapper defined elsewhere - presumably MLS by element with the modulus in lane 0, confirm)
 .endm
 
 .macro mulmod dst, src, const, const_twisted
+        vqrdmulh   t2,  \src, \const_twisted   // result kept in scratch t2 (clobbered) and now computed before the mul writes dst, so src is preserved and dst may alias src. dst and src must not be t2
         mul        \dst\().4s,  \src\().4s, \const\().4s
-        vqrdmulh   \src,  \src, \const_twisted
-        vmlsq       \dst,  \src, consts, 0
+        vmlsq      \dst,  t2, consts, 0   // combines dst with the t2 term and index 0 of consts (vmlsq is a wrapper defined elsewhere - presumably MLS by element with the modulus in lane 0, confirm)
 .endm
 
 .macro montg_reduce a
@@ -514,17 +514,17 @@ layer123_start:
         str_vo data6, in, (6*(1024/8))
         str_vo data7, in, (7*(1024/8))        
 
-        mul_ninv data4, data5, data6, data7, data0, data1, data2, data3
+        mul_ninv data0, data1, data2, data3, data0, data1, data2, data3   // both operand groups are data0-data3, so the results replace the inputs in place (previously written to data4-data7)
 
-        canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3   // t2 and t3 are passed as the last two operands of every call below (presumably scratch - confirm against the macro)
+        canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3
 
-        str_vi data4, in, (16)
-        str_vo data5, in, (-16 + 1*(1024/8))
-        str_vo data6, in, (-16 + 2*(1024/8))
-        str_vo data7, in, (-16 + 3*(1024/8))
+        str_vi data0, in, (16)   // NOTE(review): the -16 in the offsets below suggests str_vi advances in by 16 - confirm against the macro
+        str_vo data1, in, (-16 + 1*(1024/8))   // data1-data3 go to consecutive 128-byte strides, each offset reduced by 16
+        str_vo data2, in, (-16 + 2*(1024/8))
+        str_vo data3, in, (-16 + 3*(1024/8))
 
         subs count, count, #1
         cbnz count, layer123_start
diff --git a/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s b/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s
index 464e0473..2a62f678 100644
--- a/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s
+++ b/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s
@@ -47,15 +47,15 @@ xtmp1 .req x11
 .endm
 
 .macro mulmodq dst, src, const, idx0, idx1
+        vqrdmulhq   t2,  \src, \const, \idx1   // result kept in scratch t2 (clobbered) instead of overwriting src, and issued before dst is written so dst may alias src. dst and src must not be t2
         vmulq       \dst,  \src, \const, \idx0
-        vqrdmulhq   \src,  \src, \const, \idx1
-        vmlsq        \dst,  \src, consts, 0
+        vmlsq       \dst,  t2, consts, 0   // combines dst with the t2 term and index 0 of consts (vmlsq is a wrapper defined elsewhere - presumably MLS by element with the modulus in lane 0, confirm)
 .endm
 
 .macro mulmod dst, src, const, const_twisted
+        vqrdmulh   t2,  \src, \const_twisted   // result kept in scratch t2 (clobbered) and now computed before the mul writes dst, so src is preserved and dst may alias src. dst and src must not be t2
         mul        \dst\().4s,  \src\().4s, \const\().4s
-        vqrdmulh   \src,  \src, \const_twisted
-        vmlsq       \dst,  \src, consts, 0
+        vmlsq      \dst,  t2, consts, 0   // combines dst with the t2 term and index 0 of consts (vmlsq is a wrapper defined elsewhere - presumably MLS by element with the modulus in lane 0, confirm)
 .endm
 
 .macro montg_reduce a
@@ -523,17 +523,17 @@ layer123_start:
         str_vo data6, in, (6*(1024/8))
         str_vo data7, in, (7*(1024/8))        
 
-        mul_ninv data4, data5, data6, data7, data0, data1, data2, data3
+        mul_ninv data0, data1, data2, data3, data0, data1, data2, data3   // both operand groups are data0-data3, so the results replace the inputs in place (previously written to data4-data7)
 
-        canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3
-        canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3   // t2 and t3 are passed as the last two operands of every call below (presumably scratch - confirm against the macro)
+        canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3
+        canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3
 
-        str_vi data4, in, (16)
-        str_vo data5, in, (-16 + 1*(1024/8))
-        str_vo data6, in, (-16 + 2*(1024/8))
-        str_vo data7, in, (-16 + 3*(1024/8))
+        str_vi data0, in, (16)   // NOTE(review): the -16 in the offsets below suggests str_vi advances in by 16 - confirm against the macro
+        str_vo data1, in, (-16 + 1*(1024/8))   // data1-data3 go to consecutive 128-byte strides, each offset reduced by 16
+        str_vo data2, in, (-16 + 2*(1024/8))
+        str_vo data3, in, (-16 + 3*(1024/8))
 
         subs count, count, #1
         cbnz count, layer123_start
diff --git a/examples/naive/aarch64/intt_kyber_123_4567.s b/examples/naive/aarch64/intt_kyber_123_4567.s
index 034fe144..ee06171f 100644
--- a/examples/naive/aarch64/intt_kyber_123_4567.s
+++ b/examples/naive/aarch64/intt_kyber_123_4567.s
@@ -67,15 +67,15 @@
 .endm
 
 .macro mulmodq dst, src, const, idx0, idx1
+        vqrdmulhq   t2,  \src, \const, \idx1   // result kept in scratch t2 (clobbered) instead of overwriting src, and issued before dst is written so dst may alias src. dst and src must not be t2
         vmulq       \dst,  \src, \const, \idx0
-        vqrdmulhq   \src,  \src, \const, \idx1
-        vmlsq       \dst,  \src, consts, 0
+        vmlsq       \dst,  t2, consts, 0   // combines dst with the t2 term and index 0 of consts (vmlsq is a wrapper defined elsewhere - presumably MLS by element with the modulus in lane 0, confirm)
 .endm
 
 .macro mulmod dst, src, const, const_twisted
+        vqrdmulh   t2,  \src, \const_twisted   // result kept in scratch t2 (clobbered) and now computed before the mul writes dst, so src is preserved and dst may alias src. dst and src must not be t2
         mul        \dst\().8h,  \src\().8h, \const\().8h
-        vqrdmulh   \src,  \src, \const_twisted
-        vmlsq      \dst,  \src, consts, 0
+        vmlsq      \dst,  t2, consts, 0   // combines dst with the t2 term and index 0 of consts (vmlsq is a wrapper defined elsewhere - presumably MLS by element with the modulus in lane 0, confirm)
 .endm
 
 .macro gs_butterfly a, b, root, idx0, idx1
@@ -84,12 +84,6 @@
         mulmodq  \b, tmp, \root, \idx0, \idx1
 .endm
 
-.macro mulmod_v dst, src, const, const_twisted
-        mul         \dst\().8h,  \src\().8h, \const\().8h
-        vqrdmulh    \src,  \src, \const_twisted
-        vmlsq       \dst,  \src, consts, 0
-.endm
-
 .macro gs_butterfly_v a, b, root, root_twisted
         sub    tmp.8h,    \a\().8h, \b\().8h
         add    \a\().8h,    \a\().8h, \b\().8h
@@ -443,12 +437,12 @@ layer123_start:
         str_vo data6, in, (6*(512/8))
         str_vo data7, in, (7*(512/8))
 
-        mul_ninv data4, data5, data6, data7, data0, data1, data2, data3
+        mul_ninv data0, data1, data2, data3, data0, data1, data2, data3   // both operand groups are data0-data3, so the results replace the inputs in place (previously written to data4-data7)
 
-        str_vi data4, in, (16)
-        str_vo data5, in, (-16 + 1*(512/8))
-        str_vo data6, in, (-16 + 2*(512/8))
-        str_vo data7, in, (-16 + 3*(512/8))
+        str_vi data0, in, (16)   // NOTE(review): the -16 in the offsets below suggests str_vi advances in by 16 - confirm against the macro
+        str_vo data1, in, (-16 + 1*(512/8))   // data1-data3 go to consecutive 64-byte strides, each offset reduced by 16
+        str_vo data2, in, (-16 + 2*(512/8))
+        str_vo data3, in, (-16 + 3*(512/8))
 
 
         subs count, count, #1
diff --git a/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s b/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s
index b69bbe11..cdd00ff1 100644
--- a/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s
+++ b/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s
@@ -67,15 +67,15 @@
 .endm
 
 .macro mulmodq dst, src, const, idx0, idx1
+        vqrdmulhq   t2,  \src, \const, \idx1   // result kept in scratch t2 (clobbered) instead of overwriting src, and issued before dst is written so dst may alias src. dst and src must not be t2
         vmulq       \dst,  \src, \const, \idx0
-        vqrdmulhq   \src,  \src, \const, \idx1
-        vmlsq       \dst,  \src, consts, 0
+        vmlsq       \dst,  t2, consts, 0   // combines dst with the t2 term and index 0 of consts (vmlsq is a wrapper defined elsewhere - presumably MLS by element with the modulus in lane 0, confirm)
 .endm
 
 .macro mulmod dst, src, const, const_twisted
+        vqrdmulh   t2,  \src, \const_twisted   // result kept in scratch t2 (clobbered) and now computed before the mul writes dst, so src is preserved and dst may alias src. dst and src must not be t2
         mul        \dst\().8h,  \src\().8h, \const\().8h
-        vqrdmulh   \src,  \src, \const_twisted
-        vmlsq      \dst,  \src, consts, 0
+        vmlsq      \dst,  t2, consts, 0   // combines dst with the t2 term and index 0 of consts (vmlsq is a wrapper defined elsewhere - presumably MLS by element with the modulus in lane 0, confirm)
 .endm
 
 .macro gs_butterfly a, b, root, idx0, idx1
@@ -84,12 +84,6 @@
         mulmodq  \b, tmp, \root, \idx0, \idx1
 .endm
 
-.macro mulmod_v dst, src, const, const_twisted
-        mul         \dst\().8h,  \src\().8h, \const\().8h
-        vqrdmulh    \src,  \src, \const_twisted
-        vmlsq       \dst,  \src, consts, 0
-.endm
-
 .macro gs_butterfly_v a, b, root, root_twisted
         sub    tmp.8h,    \a\().8h, \b\().8h
         add    \a\().8h,    \a\().8h, \b\().8h
@@ -438,12 +432,12 @@ layer123_start:
         str_vo data6, in, (6*(512/8))
         str_vo data7, in, (7*(512/8))
 
-        mul_ninv data4, data5, data6, data7, data0, data1, data2, data3
+        mul_ninv data0, data1, data2, data3, data0, data1, data2, data3   // both operand groups are data0-data3, so the results replace the inputs in place (previously written to data4-data7)
 
-        str_vi data4, in, (16)
-        str_vo data5, in, (-16 + 1*(512/8))
-        str_vo data6, in, (-16 + 2*(512/8))
-        str_vo data7, in, (-16 + 3*(512/8))
+        str_vi data0, in, (16)   // NOTE(review): the -16 in the offsets below suggests str_vi advances in by 16 - confirm against the macro
+        str_vo data1, in, (-16 + 1*(512/8))   // data1-data3 go to consecutive 64-byte strides, each offset reduced by 16
+        str_vo data2, in, (-16 + 2*(512/8))
+        str_vo data3, in, (-16 + 3*(512/8))
 
 
         subs count, count, #1