Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Armv7-M: Allow register overlap in ldm + ldrd #153

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
38 changes: 2 additions & 36 deletions .github/workflows/test_basic.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
name: Regression tests
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
types: [ opened, synchronize, labeled ]
jobs:
examples_dry_run:
name: Dry Run (${{ matrix.target }})
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: ubuntu-latest
strategy:
matrix:
Expand All @@ -22,11 +18,6 @@ jobs:
run: |
python3 example.py --dry-run --only-target=${{ matrix.target }}
tutorial:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -35,11 +26,6 @@ jobs:
run: |
(cd tutorial && ./tutorial_all.sh)
examples_basic:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -48,11 +34,6 @@ jobs:
run: |
python3 example.py --examples simple0,simple1,simple0_loop,simple1_loop
examples_ntt_kyber_dilithium_helium_core:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -61,11 +42,6 @@ jobs:
run: |
python3 example.py --examples ntt_kyber_1_23_45_67_m55,ntt_dilithium_12_34_56_78_m55 --timeout=300
examples_ntt_kyber_dilithium_neon_core:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -74,11 +50,6 @@ jobs:
run: |
python3 example.py --examples ntt_kyber_123_4567_a55,ntt_dilithium_123_45678_a55 --timeout=300
sqmag:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -87,11 +58,6 @@ jobs:
run: |
(cd paper/scripts && NO_LOG=Y ./slothy_sqmag.sh)
fft:
if: ${{ github.event.label.name == 'needs-ci' ||
github.event.pull_request.user.login == 'hanno-becker' ||
github.event.pull_request.user.login == 'dop-amin' ||
github.event.pull_request.user.login == 'mkannwischer'
}}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand Down
31 changes: 13 additions & 18 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,9 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
def core(self,slothy):
slothy.config.variable_size=True
slothy.config.inputs_are_outputs = True
slothy.fusion_region("start", "end", ssa=False)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to put the fusion_region before the optimize, otherwise this does not help SLOTHY find a better solution.

slothy.optimize(start="start", end="end")


class Armv7mExample0Func(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
Expand Down Expand Up @@ -1569,7 +1571,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
def core(self, slothy):
slothy.config.constraints.stalls_first_attempt = 16

slothy.config.unsafe_address_offset_fixup = False
slothy.config.unsafe_address_offset_fixup = True

slothy.config.variable_size = True
slothy.config.inputs_are_outputs = True
Expand Down Expand Up @@ -1605,7 +1607,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
def core(self, slothy):
slothy.config.constraints.stalls_first_attempt = 16

slothy.config.unsafe_address_offset_fixup = False
slothy.config.unsafe_address_offset_fixup = True


slothy.config.variable_size = True
Expand All @@ -1616,12 +1618,12 @@ def core(self, slothy):
slothy.config.sw_pipelining.optimize_postamble = True
slothy.config.sw_pipelining.allow_pre = True

slothy.optimize_loop("layer123_loop")
slothy.optimize_loop("layer123_loop", forced_loop_type=Arch_Armv7M.BranchLoop)
slothy.optimize_loop("layer456_first_loop")
slothy.optimize_loop("layer456_loop")

slothy.config.inputs_are_outputs = True
slothy.optimize_loop("layer78_loop")
slothy.optimize_loop("layer78_loop", forced_loop_type=Arch_Armv7M.BranchLoop)

class pointwise_montgomery_dilithium(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down Expand Up @@ -1814,6 +1816,7 @@ def core(self, slothy):
slothy.config.constraints.stalls_first_attempt = 32

r = slothy.config.reserved_regs
r.add("r1")
r = r.union(f"s{i}" for i in range(31)) # reserve FPR
slothy.config.reserved_regs = r

Expand All @@ -1825,13 +1828,12 @@ def core(self, slothy):
slothy.config.variable_size = True
slothy.config.split_heuristic = True
slothy.config.timeout = 360 # Not more than 2min per step
slothy.config.split_heuristic_factor = 1
slothy.config.visualize_expected_performance = False
slothy.config.split_heuristic_factor = 4
slothy.config.split_heuristic_factor = 5
slothy.config.split_heuristic_stepsize = 0.15
slothy.optimize_loop("layer1234_loop")
slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop)
slothy.config.split_heuristic_optimize_seam = 6
slothy.optimize_loop("layer1234_loop")
slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop)

slothy.config.outputs = ["r14"]

Expand Down Expand Up @@ -2179,12 +2181,11 @@ def core(self, slothy):
slothy.config.variable_size = True

r = slothy.config.reserved_regs
r.add("r14")
slothy.config.reserved_regs = r

slothy.config.sw_pipelining.enabled = True
slothy.config.constraints.stalls_first_attempt = 16
slothy.optimize_loop("1")
slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)

class basemul_acc_32_16_kyber(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down Expand Up @@ -2278,14 +2279,10 @@ def core(self, slothy):
slothy.config.inputs_are_outputs = True
slothy.config.variable_size = True

r = slothy.config.reserved_regs
r.add("r14")
slothy.config.reserved_regs = r

slothy.config.unsafe_address_offset_fixup = False
slothy.config.sw_pipelining.enabled = True
slothy.config.constraints.stalls_first_attempt = 16
slothy.optimize_loop("1")
slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)

class add_kyber(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down Expand Up @@ -2484,16 +2481,14 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
def core(self, slothy):
slothy.config.inputs_are_outputs = True
slothy.config.variable_size = True
slothy.config.outputs = ["r14"]
slothy.config.unsafe_address_offset_fixup = False
r = slothy.config.reserved_regs
r.add("r14")
r = r.union(f"s{i}" for i in range(32)) # reserve FPR
slothy.config.reserved_regs = r

slothy.config.sw_pipelining.enabled = True
slothy.config.constraints.stalls_first_attempt = 16
slothy.optimize_loop("1")
slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop)

class matacc_kyber(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down
4 changes: 4 additions & 0 deletions examples/naive/armv7m/armv7m_simple0.s
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,8 @@ smlabt r3,r2, r2, r1
asrs r3, r3,#1
str r3, [r0,#4] // @slothy:writes=a

ldrd r0, r3, [r0, #4]
ldm r0 ,{r0-r2}
add r2,r3,r2
str r1, [sp, #0]
end:
24 changes: 12 additions & 12 deletions examples/naive/armv7m/basemul_acc_32_32_kyber.s
Original file line number Diff line number Diff line change
Expand Up @@ -32,31 +32,31 @@ basemul_asm_acc_opt_32_32:

movw loop, #64
1:
ldr poly0, [aptr], #8
ldr poly1, [bptr], #8
ldr poly0, [aptr], #4
ldr poly1, [bptr], #4
ldr.w res0, [rptr_tmp]
ldr tmp2, [aprimeptr], #8
ldr tmp2, [aprimeptr], #4
ldr.w res1, [rptr_tmp, #4]

// (poly0_t * zeta) * poly1_t + poly0_b * poly1_b + res
smlad tmp2, tmp2, poly1, res0
str tmp2, [rptr_tmp], #16
str tmp2, [rptr_tmp], #4

// poly1_t * poly0_b + poly1_b * poly0_t + res
smladx tmp, poly0, poly1, res1
str tmp, [rptr_tmp, #-12]
str tmp, [rptr_tmp], #4

ldr poly0, [aptr, #-4]
ldr poly1, [bptr, #-4]
ldr res0, [rptr_tmp, #-8]
ldr tmp2, [aprimeptr, #-4]
ldr res1, [rptr_tmp, #-4]
ldr poly0, [aptr], #4
ldr poly1, [bptr], #4
ldr.w res0, [rptr_tmp]
ldr tmp2, [aprimeptr], #4
ldr.w res1, [rptr_tmp, #4]

smlad tmp2, tmp2, poly1, res0
str tmp2, [rptr_tmp, #-8]
str tmp2, [rptr_tmp], #4

smladx tmp, poly0, poly1, res1
str tmp, [rptr_tmp, #-4]
str tmp, [rptr_tmp], #4

subs.w loop, loop, #1
bne.w 1b
Expand Down
37 changes: 11 additions & 26 deletions examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

.macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv
ldr \poly0, [\bptr], #8
ldr \res0, [\rptr_tmp], #16 // @slothy:core=True
ldr \res0, [\rptr_tmp], #16 // @slothy:core=True // @slothy:before=cmp

smulwt \tmp, \zeta, \poly1
smlabt \tmp, \tmp, \q, \qa
Expand Down Expand Up @@ -72,7 +72,7 @@ frombytes_mul_asm_acc_32_16:
push {r4-r11, r14}

rptr .req r0
bptr .req r1
bptr .req r3
aptr .req r2
zetaptr .req r3
t0 .req r4
Expand All @@ -85,43 +85,28 @@ frombytes_mul_asm_acc_32_16:
qinv .req r11
zeta .req r12
ctr .req r14
rptr_tmp .req r3
rptr_tmp .req r1

movw qa, #26632
movt q, #3329
### qinv=0x6ba8f301
movw qinv, #62209
movt qinv, #27560

vmov s2, zetaptr
vmov s1, r1
ldr.w rptr_tmp, [sp, #9*4] // load rptr_tmp from stack
vmov s1, rptr_tmp

add ctr, rptr_tmp, #64*4*4
1:
ldr.w zeta, [zetaptr], #4
deserialize aptr, tmp, tmp2, tmp3, t0, t1
vmov tmp, s2
ldr zeta, [tmp], #4
vmov s2, tmp
vmov s2, zetaptr
vmov bptr, s1
doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
cmp.w rptr_tmp, ctr
vmov s1, bptr // @slothy:core=True
cmp.w rptr_tmp, ctr // @slothy:id=cmp
vmov zetaptr, s2
bne.w 1b

// Original code
// ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack
// vmov s1, tmp
// vmov s2, zetaptr
// add ctr, tmp, #64*4*4
// 1:
// vmov zetaptr, s2
// ldr.w zeta, [zetaptr], #4
// deserialize aptr, tmp, tmp2, tmp3, t0, t1
// vmov s2, zetaptr
// vmov rptr_tmp, s1
// doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
// vmov s1, rptr_tmp
// cmp.w rptr_tmp, ctr
// bne.w 1b

pop {r4-r11, pc}

.size frombytes_mul_asm_acc_32_16, .-frombytes_mul_asm_acc_32_16
6 changes: 3 additions & 3 deletions examples/naive/armv7m/frombytes_mul_acc_kyber.s
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
// r[1] in upper half of tmp2
pkhtb \tmp, \tmp2, \tmp, asr #16
uadd16 \res0, \res0, \tmp
str \res0, [\rptr], #8 // @slothy:core=True
str \res0, [\rptr], #8 // @slothy:core=True // @slothy:before=cmp

neg \zeta, \zeta

Expand Down Expand Up @@ -101,13 +101,13 @@ frombytes_mul_asm_acc:
movt qinv, #27560

add ctr, rptr, #64*4*2
vmov s0, ctr
1:
ldr.w zeta, [zetaptr], #4
deserialize aptr, tmp, tmp2, tmp3, t0, t1
vmov s0, ctr
doublebasemul_frombytes_asm_acc rptr, bptr, zeta, tmp3, t0, t1, ctr, tmp, tmp2, q, qa, qinv
vmov ctr, s0
cmp.w rptr, ctr
cmp.w rptr, ctr // @slothy:id=cmp
bne.w 1b

pop {r4-r11, pc}
Expand Down
Loading