slothy-optimizer · SH1E0r1r2y · Jan 9, 2025 · Jan 10, 2025 · Jan 14, 2025 · Jan 10, 2025
diff --git a/.github/workflows/test_basic.yaml b/.github/workflows/test_basic.yaml
@@ -1,16 +1,12 @@
 name: Regression tests
 on:
+  push:  # also run the regression tests on pushes to main, in addition to PRs
+    branches: [ "main" ]
   pull_request:
     branches: [ "main" ]
-    types: [ opened, synchronize, labeled ]
 jobs:
   examples_dry_run:
     name: Dry Run (${{ matrix.target }})
-    if: ${{ github.event.label.name == 'needs-ci' ||
-            github.event.pull_request.user.login == 'hanno-becker' ||
-            github.event.pull_request.user.login == 'dop-amin' ||
-            github.event.pull_request.user.login == 'mkannwischer'
-            }}
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -22,11 +18,6 @@ jobs:
       run: |
         python3 example.py --dry-run --only-target=${{ matrix.target }}
   tutorial:
-    if: ${{ github.event.label.name == 'needs-ci' ||
-            github.event.pull_request.user.login == 'hanno-becker' ||
-            github.event.pull_request.user.login == 'dop-amin' ||
-            github.event.pull_request.user.login == 'mkannwischer'
-            }}
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -35,11 +26,6 @@ jobs:
       run: |
         (cd tutorial && ./tutorial_all.sh)
   examples_basic:
-    if: ${{ github.event.label.name == 'needs-ci' ||
-            github.event.pull_request.user.login == 'hanno-becker' ||
-            github.event.pull_request.user.login == 'dop-amin' ||
-            github.event.pull_request.user.login == 'mkannwischer'
-            }}
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -48,11 +34,6 @@ jobs:
       run: |
         python3 example.py --examples simple0,simple1,simple0_loop,simple1_loop
   examples_ntt_kyber_dilithium_helium_core:
-    if: ${{ github.event.label.name == 'needs-ci' ||
-            github.event.pull_request.user.login == 'hanno-becker' ||
-            github.event.pull_request.user.login == 'dop-amin' ||
-            github.event.pull_request.user.login == 'mkannwischer'
-            }}
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -61,11 +42,6 @@ jobs:
       run: |
         python3 example.py --examples ntt_kyber_1_23_45_67_m55,ntt_dilithium_12_34_56_78_m55 --timeout=300
   examples_ntt_kyber_dilithium_neon_core:
-    if: ${{ github.event.label.name == 'needs-ci' ||
-            github.event.pull_request.user.login == 'hanno-becker' ||
-            github.event.pull_request.user.login == 'dop-amin' ||
-            github.event.pull_request.user.login == 'mkannwischer'
-            }}
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -74,11 +50,6 @@ jobs:
       run: |
         python3 example.py --examples ntt_kyber_123_4567_a55,ntt_dilithium_123_45678_a55 --timeout=300
   sqmag:
-    if: ${{ github.event.label.name == 'needs-ci' ||
-            github.event.pull_request.user.login == 'hanno-becker' ||
-            github.event.pull_request.user.login == 'dop-amin' ||
-            github.event.pull_request.user.login == 'mkannwischer'
-            }}
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -87,11 +58,6 @@ jobs:
       run: |
         (cd paper/scripts && NO_LOG=Y ./slothy_sqmag.sh)
   fft:
-    if: ${{ github.event.label.name == 'needs-ci' ||
-            github.event.pull_request.user.login == 'hanno-becker' ||
-            github.event.pull_request.user.login == 'dop-amin' ||
-            github.event.pull_request.user.login == 'mkannwischer'
-            }}
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3

diff --git a/example.py b/example.py
@@ -669,7 +669,9 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
     def core(self,slothy):
         slothy.config.variable_size=True
         slothy.config.inputs_are_outputs = True
+        slothy.fusion_region("start", "end", ssa=False)  # pre-pass over the same "start".."end" region that optimize() handles next
         slothy.optimize(start="start", end="end")
+
 
 class Armv7mExample0Func(Example):
     def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
@@ -1569,7 +1571,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
     def core(self, slothy):
         slothy.config.constraints.stalls_first_attempt = 16
 
-        slothy.config.unsafe_address_offset_fixup = False
+        slothy.config.unsafe_address_offset_fixup = True
 
         slothy.config.variable_size = True
         slothy.config.inputs_are_outputs = True
@@ -1605,7 +1607,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
     def core(self, slothy):
         slothy.config.constraints.stalls_first_attempt = 16
 
-        slothy.config.unsafe_address_offset_fixup = False
+        slothy.config.unsafe_address_offset_fixup = True
 
 
         slothy.config.variable_size = True
@@ -1616,12 +1618,12 @@ def core(self, slothy):
         slothy.config.sw_pipelining.optimize_postamble = True
         slothy.config.sw_pipelining.allow_pre = True
 
-        slothy.optimize_loop("layer123_loop")
+        slothy.optimize_loop("layer123_loop", forced_loop_type=Arch_Armv7M.BranchLoop)
         slothy.optimize_loop("layer456_first_loop")
         slothy.optimize_loop("layer456_loop")
 
         slothy.config.inputs_are_outputs = True
-        slothy.optimize_loop("layer78_loop")
+        slothy.optimize_loop("layer78_loop", forced_loop_type=Arch_Armv7M.BranchLoop)
 
 class pointwise_montgomery_dilithium(Example):
     def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
@@ -1814,6 +1816,7 @@ def core(self, slothy):
         slothy.config.constraints.stalls_first_attempt = 32
 
         r = slothy.config.reserved_regs
+        r.add("r1")
         r = r.union(f"s{i}" for i in range(31)) # reserve FPR
         slothy.config.reserved_regs = r
 
@@ -1825,13 +1828,12 @@ def core(self, slothy):
         slothy.config.variable_size = True
         slothy.config.split_heuristic = True
         slothy.config.timeout = 360 # Not more than 2min per step
-        slothy.config.split_heuristic_factor = 1
         slothy.config.visualize_expected_performance = False
-        slothy.config.split_heuristic_factor = 4
+        slothy.config.split_heuristic_factor = 5
         slothy.config.split_heuristic_stepsize = 0.15
-        slothy.optimize_loop("layer1234_loop")
+        slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop)  # first pass
         slothy.config.split_heuristic_optimize_seam = 6
-        slothy.optimize_loop("layer1234_loop")
+        slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop)  # second pass over the same loop, now with split_heuristic_optimize_seam set
 
         slothy.config.outputs = ["r14"]
 
@@ -2179,12 +2181,11 @@ def core(self, slothy):
         slothy.config.variable_size = True
 
         r = slothy.config.reserved_regs
-        r.add("r14")
         slothy.config.reserved_regs = r
 
         slothy.config.sw_pipelining.enabled = True
         slothy.config.constraints.stalls_first_attempt = 16
-        slothy.optimize_loop("1")
+        slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)
 
 class basemul_acc_32_16_kyber(Example):
     def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
@@ -2278,14 +2279,10 @@ def core(self, slothy):
         slothy.config.inputs_are_outputs = True
         slothy.config.variable_size = True
 
-        r = slothy.config.reserved_regs
-        r.add("r14")
-        slothy.config.reserved_regs = r
-
         slothy.config.unsafe_address_offset_fixup = False
         slothy.config.sw_pipelining.enabled = True
         slothy.config.constraints.stalls_first_attempt = 16
-        slothy.optimize_loop("1")
+        slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)
 
 class add_kyber(Example):
     def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
@@ -2484,16 +2481,14 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
     def core(self, slothy):
         slothy.config.inputs_are_outputs = True
         slothy.config.variable_size = True
-        slothy.config.outputs = ["r14"]
         slothy.config.unsafe_address_offset_fixup = False
         r = slothy.config.reserved_regs
-        r.add("r14")
         r = r.union(f"s{i}" for i in range(32)) # reserve FPR
         slothy.config.reserved_regs = r
 
         slothy.config.sw_pipelining.enabled = True
         slothy.config.constraints.stalls_first_attempt = 16
-        slothy.optimize_loop("1")
+        slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)
 
 class matacc_kyber(Example):
     def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):

diff --git a/examples/naive/armv7m/armv7m_simple0.s b/examples/naive/armv7m/armv7m_simple0.s
@@ -29,4 +29,8 @@ smlabt r3,r2, r2, r1
 asrs r3,   r3,#1
 str r3, [r0,#4] // @slothy:writes=a
 
+ldrd r0, r3, [r0, #4] // r0 = word at [r0+4], r3 = word at [r0+8]; the base register is overwritten
+ldm r0 ,{r0-r2} // load r0, r1, r2 from the address just loaded into r0 (no writeback)
+add r2,r3,r2 // r2 = r3 + r2
+str r1, [sp, #0] // store r1 at the address in sp
 end:
diff --git a/examples/naive/armv7m/basemul_acc_32_32_kyber.s b/examples/naive/armv7m/basemul_acc_32_32_kyber.s
@@ -32,31 +32,31 @@ basemul_asm_acc_opt_32_32:
 
   movw loop, #64
   1:
-    ldr poly0, [aptr], #8
-    ldr poly1, [bptr], #8
+    ldr poly0, [aptr], #4 // advance by one word only; the second half of the loop body post-increments again
+    ldr poly1, [bptr], #4
     ldr.w res0, [rptr_tmp]
-    ldr tmp2, [aprimeptr], #8
+    ldr tmp2, [aprimeptr], #4
     ldr.w res1, [rptr_tmp, #4]
 
     // (poly0_t * zeta) * poly1_t + poly0_b * poly0_t + res
     smlad tmp2, tmp2, poly1, res0
-    str tmp2, [rptr_tmp], #16
+    str tmp2, [rptr_tmp], #4 // overwrite the word res0 was loaded from, then step to the next word
 
     // poly1_t * poly0_b + poly1_b * poly0_t + res
     smladx tmp, poly0, poly1, res1
-    str tmp, [rptr_tmp, #-12]
+    str tmp, [rptr_tmp], #4
 
-    ldr poly0, [aptr, #-4]
-    ldr poly1, [bptr, #-4]
-    ldr res0, [rptr_tmp, #-8]
-    ldr tmp2, [aprimeptr, #-4]
-    ldr res1, [rptr_tmp, #-4]
+    ldr poly0, [aptr], #4
+    ldr poly1, [bptr], #4
+    ldr.w res0, [rptr_tmp]
+    ldr tmp2, [aprimeptr], #4
+    ldr.w res1, [rptr_tmp, #4]
 
     smlad tmp2, tmp2, poly1, res0
-    str tmp2, [rptr_tmp, #-8]
+    str tmp2, [rptr_tmp], #4
 
     smladx tmp, poly0, poly1, res1
-    str tmp, [rptr_tmp, #-4]
+    str tmp, [rptr_tmp], #4
 
     subs.w loop, loop, #1
   bne.w 1b

diff --git a/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s b/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s
@@ -12,7 +12,7 @@
 
 .macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv
   ldr \poly0, [\bptr], #8
-  ldr \res0, [\rptr_tmp], #16 // @slothy:core=True
+  ldr \res0, [\rptr_tmp], #16 // @slothy:core=True // @slothy:before=cmp
 
   smulwt \tmp, \zeta, \poly1
 	smlabt \tmp, \tmp, \q, \qa
@@ -72,7 +72,7 @@ frombytes_mul_asm_acc_32_16:
   push {r4-r11, r14}
 
   rptr     .req r0
-  bptr     .req r1
+  bptr     .req r3
   aptr     .req r2
   zetaptr  .req r3
   t0       .req r4
@@ -85,43 +85,28 @@ frombytes_mul_asm_acc_32_16:
 	qinv     .req r11
 	zeta     .req r12
 	ctr      .req r14
-  rptr_tmp .req r3
+  rptr_tmp .req r1
 
   movw qa, #26632
 	movt  q, #3329
 	### qinv=0x6ba8f301
 	movw qinv, #62209
 	movt qinv, #27560
 
-  vmov s2, zetaptr
+  vmov s1, r1 // park the value arriving in r1 (reloaded into bptr inside the loop); r1 is reused as rptr_tmp next
   ldr.w rptr_tmp, [sp, #9*4] // load rptr_tmp from stack
-  vmov s1, rptr_tmp
+
   add ctr, rptr_tmp, #64*4*4
   1:
+    ldr.w zeta, [zetaptr], #4 // r3 holds zetaptr at the top of every iteration
     deserialize aptr, tmp, tmp2, tmp3, t0, t1
-    vmov tmp, s2
-    ldr zeta, [tmp], #4
-    vmov s2, tmp
+    vmov s2, zetaptr // save zetaptr: bptr aliases the same register (r3)
+    vmov bptr, s1 // r3 now holds bptr for the macro below
     doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
-    cmp.w rptr_tmp, ctr
+    vmov s1, bptr // @slothy:core=True
+    cmp.w rptr_tmp, ctr // @slothy:id=cmp
+    vmov zetaptr, s2 // restore zetaptr into r3 for the next iteration; vmov leaves the cmp flags intact for bne
     bne.w 1b
 
-  // Original code
-  // ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack
-  // vmov s1, tmp
-  // vmov s2, zetaptr
-  // add ctr, tmp, #64*4*4
-  // 1:
-  //   vmov zetaptr, s2
-  //   ldr.w zeta, [zetaptr], #4
-  //   deserialize aptr, tmp, tmp2, tmp3, t0, t1
-  //   vmov s2, zetaptr
-  //   vmov rptr_tmp, s1
-  //   doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
-  //   vmov s1, rptr_tmp
-  //   cmp.w rptr_tmp, ctr
-  //   bne.w 1b
-
 pop {r4-r11, pc}
-
 .size frombytes_mul_asm_acc_32_16, .-frombytes_mul_asm_acc_32_16
diff --git a/examples/naive/armv7m/frombytes_mul_acc_kyber.s b/examples/naive/armv7m/frombytes_mul_acc_kyber.s
@@ -32,7 +32,7 @@
 	// r[1] in upper half of tmp2
 	pkhtb \tmp, \tmp2, \tmp, asr #16
 	uadd16 \res0, \res0, \tmp
-	str \res0, [\rptr], #8 // @slothy:core=True
+	str \res0, [\rptr], #8 // @slothy:core=True // @slothy:before=cmp
 
 	neg \zeta, \zeta
 
@@ -101,13 +101,13 @@ frombytes_mul_asm_acc:
 	movt qinv, #27560
 
 	add ctr, rptr, #64*4*2
-	vmov s0, ctr
 	1:
 		ldr.w zeta, [zetaptr], #4
 		deserialize aptr, tmp, tmp2, tmp3, t0, t1
+		vmov s0, ctr // save ctr: it is also passed to the macro on the next line (presumably overwritten there - confirm)
 		doublebasemul_frombytes_asm_acc rptr, bptr, zeta, tmp3, t0, t1, ctr, tmp, tmp2, q, qa, qinv
 		vmov ctr, s0
-	cmp.w rptr, ctr
+	cmp.w rptr, ctr // @slothy:id=cmp
 	bne.w 1b
 
 	pop {r4-r11, pc}