Skip to content

Commit

Permalink
Merge branch 'main' into armv7m
Browse files Browse the repository at this point in the history
  • Loading branch information
mkannwischer committed Jan 13, 2025
2 parents 9cf5191 + f78627a commit ac2f6c0
Show file tree
Hide file tree
Showing 16 changed files with 3,634 additions and 3,326 deletions.
46 changes: 15 additions & 31 deletions .github/workflows/test_basic.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
name: Regression tests
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
types: [ opened, synchronize, labeled ]
jobs:
examples_dry_run:
name: Dry Run (${{ matrix.target }})
runs-on: ubuntu-latest
strategy:
matrix:
target: [slothy.targets.arm_v7m.cortex_m4,slothy.targets.arm_v7m.cortex_m7,slothy.targets.arm_v81m.cortex_m55r1, slothy.targets.arm_v81m.cortex_m85r1, slothy.targets.aarch64.cortex_a55, slothy.targets.aarch64.cortex_a72_frontend, slothy.targets.aarch64.apple_m1_firestorm_experimental, slothy.targets.aarch64.apple_m1_icestorm_experimental]
Expand All @@ -27,12 +32,16 @@ jobs:
run: |
python3 example.py --dry-run --only-target=${{ matrix.target }}
tutorial:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/setup-macos
Expand All @@ -45,12 +54,7 @@ jobs:
run: |
(cd tutorial && ./tutorial_all.sh)
examples_basic:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: macos-latest
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/setup-macos
Expand All @@ -63,12 +67,7 @@ jobs:
run: |
python3 example.py --examples simple0,simple1,simple0_loop,simple1_loop
examples_ntt_kyber_dilithium_helium_core:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: macos-latest
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/setup-macos
Expand All @@ -81,12 +80,7 @@ jobs:
run: |
python3 example.py --examples ntt_kyber_1_23_45_67_m55,ntt_dilithium_12_34_56_78_m55 --timeout=1200
examples_ntt_kyber_dilithium_neon_core:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: macos-latest
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/setup-macos
Expand All @@ -99,12 +93,7 @@ jobs:
run: |
python3 example.py --examples ntt_kyber_123_4567_a55,ntt_dilithium_123_45678_a55 --timeout=1200
sqmag:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: macos-latest
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/setup-macos
Expand All @@ -117,12 +106,7 @@ jobs:
run: |
(cd paper/scripts && NO_LOG=Y ./slothy_sqmag.sh)
fft:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: macos-latest
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/setup-macos
Expand Down
29 changes: 11 additions & 18 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -1623,7 +1623,7 @@ def core(self, slothy):

slothy.config.constraints.stalls_first_attempt = 16

slothy.config.unsafe_address_offset_fixup = False
slothy.config.unsafe_address_offset_fixup = True

slothy.config.variable_size = True
slothy.config.inputs_are_outputs = True
Expand Down Expand Up @@ -1659,7 +1659,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
def core(self, slothy):
slothy.config.constraints.stalls_first_attempt = 16

slothy.config.unsafe_address_offset_fixup = False
slothy.config.unsafe_address_offset_fixup = True


slothy.config.variable_size = True
Expand All @@ -1670,12 +1670,12 @@ def core(self, slothy):
slothy.config.sw_pipelining.optimize_postamble = True
slothy.config.sw_pipelining.allow_pre = True

slothy.optimize_loop("layer123_loop")
slothy.optimize_loop("layer123_loop", forced_loop_type=Arch_Armv7M.BranchLoop)
slothy.optimize_loop("layer456_first_loop")
slothy.optimize_loop("layer456_loop")

slothy.config.inputs_are_outputs = True
slothy.optimize_loop("layer78_loop")
slothy.optimize_loop("layer78_loop", forced_loop_type=Arch_Armv7M.BranchLoop)

class pointwise_montgomery_dilithium(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down Expand Up @@ -1868,6 +1868,7 @@ def core(self, slothy):
slothy.config.constraints.stalls_first_attempt = 32

r = slothy.config.reserved_regs
r.add("r1")
r = r.union(f"s{i}" for i in range(31)) # reserve FPR
slothy.config.reserved_regs = r

Expand All @@ -1879,13 +1880,12 @@ def core(self, slothy):
slothy.config.variable_size = True
slothy.config.split_heuristic = True
slothy.config.timeout = 360 # Not more than 6min per step
slothy.config.split_heuristic_factor = 1
slothy.config.visualize_expected_performance = False
slothy.config.split_heuristic_factor = 4
slothy.config.split_heuristic_factor = 5
slothy.config.split_heuristic_stepsize = 0.15
slothy.optimize_loop("layer1234_loop")
slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop)
slothy.config.split_heuristic_optimize_seam = 6
slothy.optimize_loop("layer1234_loop")
slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop)

slothy.config.outputs = ["r14"]

Expand Down Expand Up @@ -2233,12 +2233,11 @@ def core(self, slothy):
slothy.config.variable_size = True

r = slothy.config.reserved_regs
r.add("r14")
slothy.config.reserved_regs = r

slothy.config.sw_pipelining.enabled = True
slothy.config.constraints.stalls_first_attempt = 16
slothy.optimize_loop("1")
slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)

class basemul_acc_32_16_kyber(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down Expand Up @@ -2332,14 +2331,10 @@ def core(self, slothy):
slothy.config.inputs_are_outputs = True
slothy.config.variable_size = True

r = slothy.config.reserved_regs
r.add("r14")
slothy.config.reserved_regs = r

slothy.config.unsafe_address_offset_fixup = False
slothy.config.sw_pipelining.enabled = True
slothy.config.constraints.stalls_first_attempt = 16
slothy.optimize_loop("1")
slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)

class add_kyber(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down Expand Up @@ -2538,16 +2533,14 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
def core(self, slothy):
slothy.config.inputs_are_outputs = True
slothy.config.variable_size = True
slothy.config.outputs = ["r14"]
slothy.config.unsafe_address_offset_fixup = False
r = slothy.config.reserved_regs
r.add("r14")
r = r.union(f"s{i}" for i in range(32)) # reserve FPR
slothy.config.reserved_regs = r

slothy.config.sw_pipelining.enabled = True
slothy.config.constraints.stalls_first_attempt = 16
slothy.optimize_loop("1")
slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)

class matacc_kyber(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down
24 changes: 12 additions & 12 deletions examples/naive/armv7m/basemul_acc_32_32_kyber.s
Original file line number Diff line number Diff line change
Expand Up @@ -32,31 +32,31 @@ basemul_asm_acc_opt_32_32:

movw loop, #64
1:
ldr poly0, [aptr], #8
ldr poly1, [bptr], #8
ldr poly0, [aptr], #4
ldr poly1, [bptr], #4
ldr.w res0, [rptr_tmp]
ldr tmp2, [aprimeptr], #8
ldr tmp2, [aprimeptr], #4
ldr.w res1, [rptr_tmp, #4]

// (poly0_t * zeta) * poly1_t + poly0_b * poly0_t + res
smlad tmp2, tmp2, poly1, res0
str tmp2, [rptr_tmp], #16
str tmp2, [rptr_tmp], #4

// poly1_t * poly0_b + poly1_b * poly0_t + res
smladx tmp, poly0, poly1, res1
str tmp, [rptr_tmp, #-12]
str tmp, [rptr_tmp], #4

ldr poly0, [aptr, #-4]
ldr poly1, [bptr, #-4]
ldr res0, [rptr_tmp, #-8]
ldr tmp2, [aprimeptr, #-4]
ldr res1, [rptr_tmp, #-4]
ldr poly0, [aptr], #4
ldr poly1, [bptr], #4
ldr.w res0, [rptr_tmp]
ldr tmp2, [aprimeptr], #4
ldr.w res1, [rptr_tmp, #4]

smlad tmp2, tmp2, poly1, res0
str tmp2, [rptr_tmp, #-8]
str tmp2, [rptr_tmp], #4

smladx tmp, poly0, poly1, res1
str tmp, [rptr_tmp, #-4]
str tmp, [rptr_tmp], #4

subs.w loop, loop, #1
bne.w 1b
Expand Down
37 changes: 11 additions & 26 deletions examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

.macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv
ldr \poly0, [\bptr], #8
ldr \res0, [\rptr_tmp], #16 // @slothy:core=True
ldr \res0, [\rptr_tmp], #16 // @slothy:core=True // @slothy:before=cmp

smulwt \tmp, \zeta, \poly1
smlabt \tmp, \tmp, \q, \qa
Expand Down Expand Up @@ -72,7 +72,7 @@ frombytes_mul_asm_acc_32_16:
push {r4-r11, r14}

rptr .req r0
bptr .req r1
bptr .req r3
aptr .req r2
zetaptr .req r3
t0 .req r4
Expand All @@ -85,43 +85,28 @@ frombytes_mul_asm_acc_32_16:
qinv .req r11
zeta .req r12
ctr .req r14
rptr_tmp .req r3
rptr_tmp .req r1

movw qa, #26632
movt q, #3329
### qinv=0x6ba8f301
movw qinv, #62209
movt qinv, #27560

vmov s2, zetaptr
vmov s1, r1
ldr.w rptr_tmp, [sp, #9*4] // load rptr_tmp from stack
vmov s1, rptr_tmp

add ctr, rptr_tmp, #64*4*4
1:
ldr.w zeta, [zetaptr], #4
deserialize aptr, tmp, tmp2, tmp3, t0, t1
vmov tmp, s2
ldr zeta, [tmp], #4
vmov s2, tmp
vmov s2, zetaptr
vmov bptr, s1
doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
cmp.w rptr_tmp, ctr
vmov s1, bptr // @slothy:core=True
cmp.w rptr_tmp, ctr // @slothy:id=cmp
vmov zetaptr, s2
bne.w 1b

// Original code
// ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack
// vmov s1, tmp
// vmov s2, zetaptr
// add ctr, tmp, #64*4*4
// 1:
// vmov zetaptr, s2
// ldr.w zeta, [zetaptr], #4
// deserialize aptr, tmp, tmp2, tmp3, t0, t1
// vmov s2, zetaptr
// vmov rptr_tmp, s1
// doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
// vmov s1, rptr_tmp
// cmp.w rptr_tmp, ctr
// bne.w 1b

pop {r4-r11, pc}

.size frombytes_mul_asm_acc_32_16, .-frombytes_mul_asm_acc_32_16
6 changes: 3 additions & 3 deletions examples/naive/armv7m/frombytes_mul_acc_kyber.s
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
// r[1] in upper half of tmp2
pkhtb \tmp, \tmp2, \tmp, asr #16
uadd16 \res0, \res0, \tmp
str \res0, [\rptr], #8 // @slothy:core=True
str \res0, [\rptr], #8 // @slothy:core=True // @slothy:before=cmp

neg \zeta, \zeta

Expand Down Expand Up @@ -101,13 +101,13 @@ frombytes_mul_asm_acc:
movt qinv, #27560

add ctr, rptr, #64*4*2
vmov s0, ctr
1:
ldr.w zeta, [zetaptr], #4
deserialize aptr, tmp, tmp2, tmp3, t0, t1
vmov s0, ctr
doublebasemul_frombytes_asm_acc rptr, bptr, zeta, tmp3, t0, t1, ctr, tmp, tmp2, q, qa, qinv
vmov ctr, s0
cmp.w rptr, ctr
cmp.w rptr, ctr // @slothy:id=cmp
bne.w 1b

pop {r4-r11, pc}
Expand Down
Loading

0 comments on commit ac2f6c0

Please sign in to comment.