From 0f18e1375818e1266a4eba7e06663b4e814505cd Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 10 Jan 2025 16:31:31 +0100 Subject: [PATCH 1/3] CM7: Simplify Kyber basemuls naive * This commit simplifies the kyber basemul naive implementations to revert modifications to the code originally taken from pqm4 that were only introduced to accommodate shortcomings of slothy's abilities. --- example.py | 13 +- .../naive/armv7m/basemul_acc_32_32_kyber.s | 24 +- .../armv7m/frombytes_mul_acc_32_16_kyber.s | 37 +- .../naive/armv7m/frombytes_mul_acc_kyber.s | 6 +- .../armv7m/basemul_acc_32_32_kyber_opt_m7.s | 225 ++++---- .../frombytes_mul_acc_32_16_kyber_opt_m7.s | 486 +++++++++--------- .../armv7m/frombytes_mul_acc_kyber_opt_m7.s | 448 ++++++++-------- 7 files changed, 613 insertions(+), 626 deletions(-) diff --git a/example.py b/example.py index c398799a..c0a7cf65 100644 --- a/example.py +++ b/example.py @@ -2179,12 +2179,11 @@ def core(self, slothy): slothy.config.variable_size = True r = slothy.config.reserved_regs - r.add("r14") slothy.config.reserved_regs = r slothy.config.sw_pipelining.enabled = True slothy.config.constraints.stalls_first_attempt = 16 - slothy.optimize_loop("1") + slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop) class basemul_acc_32_16_kyber(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None): @@ -2278,14 +2277,10 @@ def core(self, slothy): slothy.config.inputs_are_outputs = True slothy.config.variable_size = True - r = slothy.config.reserved_regs - r.add("r14") - slothy.config.reserved_regs = r - slothy.config.unsafe_address_offset_fixup = False slothy.config.sw_pipelining.enabled = True slothy.config.constraints.stalls_first_attempt = 16 - slothy.optimize_loop("1") + slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop) class add_kyber(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None): @@ -2484,16 +2479,14 @@ def 
__init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non def core(self, slothy): slothy.config.inputs_are_outputs = True slothy.config.variable_size = True - slothy.config.outputs = ["r14"] slothy.config.unsafe_address_offset_fixup = False r = slothy.config.reserved_regs - r.add("r14") r = r.union(f"s{i}" for i in range(32)) # reserve FPR slothy.config.reserved_regs = r slothy.config.sw_pipelining.enabled = True slothy.config.constraints.stalls_first_attempt = 16 - slothy.optimize_loop("1") + slothy.optimize_loop("1", forced_loop_type=Arch_Armv7M.BranchLoop) class matacc_kyber(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None): diff --git a/examples/naive/armv7m/basemul_acc_32_32_kyber.s b/examples/naive/armv7m/basemul_acc_32_32_kyber.s index 6d6b3e5d..ca30e75a 100644 --- a/examples/naive/armv7m/basemul_acc_32_32_kyber.s +++ b/examples/naive/armv7m/basemul_acc_32_32_kyber.s @@ -32,31 +32,31 @@ basemul_asm_acc_opt_32_32: movw loop, #64 1: - ldr poly0, [aptr], #8 - ldr poly1, [bptr], #8 + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 ldr.w res0, [rptr_tmp] - ldr tmp2, [aprimeptr], #8 + ldr tmp2, [aprimeptr], #4 ldr.w res1, [rptr_tmp, #4] // (poly0_t * zeta) * poly1_t + poly0_b * poly0_t + res smlad tmp2, tmp2, poly1, res0 - str tmp2, [rptr_tmp], #16 + str tmp2, [rptr_tmp], #4 // poly1_t * poly0_b + poly1_b * poly0_t + res smladx tmp, poly0, poly1, res1 - str tmp, [rptr_tmp, #-12] + str tmp, [rptr_tmp], #4 - ldr poly0, [aptr, #-4] - ldr poly1, [bptr, #-4] - ldr res0, [rptr_tmp, #-8] - ldr tmp2, [aprimeptr, #-4] - ldr res1, [rptr_tmp, #-4] + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 + ldr.w res0, [rptr_tmp] + ldr tmp2, [aprimeptr], #4 + ldr.w res1, [rptr_tmp, #4] smlad tmp2, tmp2, poly1, res0 - str tmp2, [rptr_tmp, #-8] + str tmp2, [rptr_tmp], #4 smladx tmp, poly0, poly1, res1 - str tmp, [rptr_tmp, #-4] + str tmp, [rptr_tmp], #4 subs.w loop, loop, #1 bne.w 1b diff --git 
a/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s b/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s index bf02e568..3070a36b 100644 --- a/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s +++ b/examples/naive/armv7m/frombytes_mul_acc_32_16_kyber.s @@ -12,7 +12,7 @@ .macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv ldr \poly0, [\bptr], #8 - ldr \res0, [\rptr_tmp], #16 // @slothy:core=True + ldr \res0, [\rptr_tmp], #16 // @slothy:core=True // @slothy:before=cmp smulwt \tmp, \zeta, \poly1 smlabt \tmp, \tmp, \q, \qa @@ -72,7 +72,7 @@ frombytes_mul_asm_acc_32_16: push {r4-r11, r14} rptr .req r0 - bptr .req r1 + bptr .req r3 aptr .req r2 zetaptr .req r3 t0 .req r4 @@ -85,7 +85,7 @@ frombytes_mul_asm_acc_32_16: qinv .req r11 zeta .req r12 ctr .req r14 - rptr_tmp .req r3 + rptr_tmp .req r1 movw qa, #26632 movt q, #3329 @@ -93,35 +93,20 @@ frombytes_mul_asm_acc_32_16: movw qinv, #62209 movt qinv, #27560 - vmov s2, zetaptr + vmov s1, r1 ldr.w rptr_tmp, [sp, #9*4] // load rptr_tmp from stack - vmov s1, rptr_tmp + add ctr, rptr_tmp, #64*4*4 1: + ldr.w zeta, [zetaptr], #4 deserialize aptr, tmp, tmp2, tmp3, t0, t1 - vmov tmp, s2 - ldr zeta, [tmp], #4 - vmov s2, tmp + vmov s2, zetaptr + vmov bptr, s1 doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv - cmp.w rptr_tmp, ctr + vmov s1, bptr // @slothy:core=True + cmp.w rptr_tmp, ctr // @slothy:id=cmp + vmov zetaptr, s2 bne.w 1b - // Original code - // ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack - // vmov s1, tmp - // vmov s2, zetaptr - // add ctr, tmp, #64*4*4 - // 1: - // vmov zetaptr, s2 - // ldr.w zeta, [zetaptr], #4 - // deserialize aptr, tmp, tmp2, tmp3, t0, t1 - // vmov s2, zetaptr - // vmov rptr_tmp, s1 - // doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv - // vmov s1, rptr_tmp - // cmp.w rptr_tmp, ctr - // bne.w 1b - pop {r4-r11, 
pc} - .size frombytes_mul_asm_acc_32_16, .-frombytes_mul_asm_acc_32_16 \ No newline at end of file diff --git a/examples/naive/armv7m/frombytes_mul_acc_kyber.s b/examples/naive/armv7m/frombytes_mul_acc_kyber.s index 563e0151..084a65d9 100644 --- a/examples/naive/armv7m/frombytes_mul_acc_kyber.s +++ b/examples/naive/armv7m/frombytes_mul_acc_kyber.s @@ -32,7 +32,7 @@ // r[1] in upper half of tmp2 pkhtb \tmp, \tmp2, \tmp, asr #16 uadd16 \res0, \res0, \tmp - str \res0, [\rptr], #8 // @slothy:core=True + str \res0, [\rptr], #8 // @slothy:core=True // @slothy:before=cmp neg \zeta, \zeta @@ -101,13 +101,13 @@ frombytes_mul_asm_acc: movt qinv, #27560 add ctr, rptr, #64*4*2 - vmov s0, ctr 1: ldr.w zeta, [zetaptr], #4 deserialize aptr, tmp, tmp2, tmp3, t0, t1 + vmov s0, ctr doublebasemul_frombytes_asm_acc rptr, bptr, zeta, tmp3, t0, t1, ctr, tmp, tmp2, q, qa, qinv vmov ctr, s0 - cmp.w rptr, ctr + cmp.w rptr, ctr // @slothy:id=cmp bne.w 1b pop {r4-r11, pc} diff --git a/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s b/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s index 855e6e69..6c1ab50e 100644 --- a/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s +++ b/examples/opt/armv7m/basemul_acc_32_32_kyber_opt_m7.s @@ -31,85 +31,89 @@ basemul_asm_acc_opt_32_32_opt_m7: // movt qinv, #27560 movw loop, #64 - // Instructions: 1 - // Expected cycles: 1 - // Expected IPC: 1.00 - // - // Cycle bound: 1.0 - // IPC bound: 1.00 - // - // Wall time: 0.00s - // User time: 0.00s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr r10, [r2], #8 // *............................. + // Instructions: 2 + // Expected cycles: 2 + // Expected IPC: 1.00 + // + // Cycle bound: 2.0 + // IPC bound: 1.00 + // + // Wall time: 0.00s + // User time: 0.00s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r4, [r3], #4 // *............................. + ldr r8, [r3], #4 // .*............................ 
- // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr r10, [r2], #8 // *.............................. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr r4, [r3], #4 // *.............................. + // ldr r8, [r3], #4 // .*............................. sub r14, r14, #1 1: - // Instructions: 19 - // Expected cycles: 10 - // Expected IPC: 1.90 - // - // Cycle bound: 11.0 - // IPC bound: 1.73 - // - // Wall time: 0.32s - // User time: 0.32s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr r11, [r1], #8 // *............................. - ldr.w r4, [r0, #4] // *............................. - ldr r6, [r3, #4] // .*............................ - ldr r12, [r0, #8] // .*............................ - ldr r8, [r2, #-4] // ..*........................... - smladx r5, r11, r10, r4 // ..*........................... - ldr r9, [r1, #-4] // ...*.......................... - str r5, [r0, #4] // ...*.......................... - smlad r12, r6, r8, r12 // ....*......................... - ldr r7, [r0, #12] // ....*......................... - ldr r11, [r3], #8 // .....*........................ - str r12, [r0, #8] // .....*........................ - ldr.w r4, [r0] // ......*....................... - smladx r9, r9, r8, r7 // ......*....................... - str r9, [r0, #12] // .......*...................... - subs.w r14, r14, #1 // .......*...................... - smlad r4, r11, r10, r4 // ........*..................... - ldr r10, [r2], #8 // ........e..................... - str r4, [r0], #16 // .........*.................... + // Instructions: 20 + // Expected cycles: 10 + // Expected IPC: 2.00 + // + // Cycle bound: 12.0 + // IPC bound: 1.67 + // + // Wall time: 0.48s + // User time: 0.48s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r10, [r2], #4 // *............................. 
+ ldr.w r7, [r0] // *............................. + ldr r5, [r1], #4 // .*............................ + ldr.w r12, [r0, #8] // .*............................ + ldr.w r9, [r0, #4] // ..*........................... + smlad r7, r4, r10, r7 // ..*........................... + ldr r6, [r2], #4 // ...*.......................... + str r7, [r0], #4 // ...*.......................... + ldr.w r7, [r0, #8] // ....*......................... + smladx r5, r5, r10, r9 // ....*......................... + ldr r11, [r1], #4 // .....*........................ + smlad r10, r8, r6, r12 // .....*........................ + subs.w r14, r14, #1 // ......*....................... + str r5, [r0], #4 // ......*....................... + ldr r4, [r3], #4 // .......e...................... + smladx r7, r11, r6, r7 // .......*...................... + str r10, [r0], #4 // ........*..................... + ldr r8, [r3], #4 // ........e..................... + str r7, [r0], #4 // .........*.................... + bne.w 1b // .........*.................... // @slothy:branch // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr r4, [r1], #8 // ..*.........~.........~........ - // ldr r6, [r2], #8 // e.'.......~.'.......~.'........ - // ldr.w r5, [r0] // ..'.....*...'.....~...'.....~.. - // ldr r12, [r3], #8 // ..'....*....'....~....'....~... - // ldr.w r7, [r0, #4] // ..*.........~.........~........ - // smlad r12, r12, r6, r5 // ~.'.......*.'.......~.'........ - // str r12, [r0], #16 // .~'........*'........~'........ - // smladx r11, r4, r6, r7 // ..'.*.......'.~.......'.~...... - // str r11, [r0, #-12] // ..'..*......'..~......'..~..... - // ldr r4, [r1, #-4] // ..'..*......'..~......'..~..... - // ldr r6, [r2, #-4] // ..'.*.......'.~.......'.~...... - // ldr r5, [r0, #-8] // ..'*........'~........'~....... - // ldr r12, [r3, #-4] // ..'*........'~........'~....... - // ldr r7, [r0, #-4] // ..'...*.....'...~.....'...~.... 
- // smlad r12, r12, r6, r5 // ..'...*.....'...~.....'...~.... - // str r12, [r0, #-8] // ..'....*....'....~....'....~... - // smladx r11, r4, r6, r7 // ..'.....*...'.....~...'.....~.. - // str r11, [r0, #-4] // ..'......*..'......~..'......~. - // subs.w r14, r14, #1 // ..'......*..'......~..'......~. + // ldr r4, [r1], #4 // ...'*........'~........'~...... + // ldr r6, [r2], #4 // ...*.........~.........~....... + // ldr.w r5, [r0] // ...*.........~.........~....... + // ldr r12, [r3], #4 // e..'......~..'......~..'....... + // ldr.w r7, [r0, #4] // ...'.*.......'.~.......'.~..... + // smlad r12, r12, r6, r5 // ...'.*.......'.~.......'.~..... + // str r12, [r0], #4 // ...'..*......'..~......'..~.... + // smladx r11, r4, r6, r7 // ...'...*.....'...~.....'...~... + // str r11, [r0], #4 // ...'.....*...'.....~...'.....~. + // ldr r4, [r1], #4 // ...'....*....'....~....'....~.. + // ldr r6, [r2], #4 // ...'..*......'..~......'..~.... + // ldr.w r5, [r0] // ...'*........'~........'~...... + // ldr r12, [r3], #4 // .e.'.......~.'.......~.'....... + // ldr.w r7, [r0, #4] // ...'...*.....'...~.....'...~... + // smlad r12, r12, r6, r5 // ...'....*....'....~....'....~.. + // str r12, [r0], #4 // .~.'.......*.'.......~.'....... + // smladx r11, r4, r6, r7 // ~..'......*..'......~..'....... + // str r11, [r0], #4 // ..~'........*'........~'....... + // subs.w r14, r14, #1 // ...'.....*...'.....~...'.....~. + // bne.w 1b // ..~'........*'........~'....... + - bne 1b // Instructions: 18 // Expected cycles: 10 // Expected IPC: 1.80 @@ -117,52 +121,51 @@ basemul_asm_acc_opt_32_32_opt_m7: // Cycle bound: 10.0 // IPC bound: 1.80 // - // Wall time: 0.02s - // User time: 0.02s + // Wall time: 0.05s + // User time: 0.05s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr r8, [r1], #8 // *............................. - ldr.w r11, [r0, #4] // *............................. - ldr r7, [r3, #4] // .*............................ 
- ldr r9, [r3], #8 // .*............................ - smladx r12, r8, r10, r11 // ..*........................... - ldr.w r4, [r0] // ..*........................... - ldr r8, [r0, #8] // ...*.......................... - str r12, [r0, #4] // ...*.......................... - smlad r10, r9, r10, r4 // ....*......................... - ldr r11, [r2, #-4] // ....*......................... - str r10, [r0], #16 // .....*........................ - ldr r10, [r1, #-4] // .....*........................ - smlad r4, r7, r11, r8 // ......*....................... - ldr r5, [r0, #-4] // ......*....................... - subs.w r14, r14, #1 // .......*...................... - str r4, [r0, #-8] // .......*...................... - smladx r6, r10, r11, r5 // ........*..................... - str r6, [r0, #-4] // .........*.................... + ldr.w r6, [r0] // *............................. + ldr r10, [r2], #4 // *............................. + ldr.w r9, [r0, #8] // .*............................ + ldr r11, [r1], #4 // .*............................ + ldr r12, [r2], #4 // ..*........................... + smlad r5, r4, r10, r6 // ..*........................... + str r5, [r0], #4 // ...*.......................... + ldr.w r7, [r0, #0] // ...*.......................... + subs.w r14, r14, #1 // ....*......................... + smlad r5, r8, r12, r9 // ....*......................... + ldr r6, [r1], #4 // .....*........................ + smladx r10, r11, r10, r7 // .....*........................ + str r10, [r0], #4 // ......*....................... + ldr.w r10, [r0, #4] // ......*....................... + str r5, [r0], #4 // .......*...................... + smladx r9, r6, r12, r10 // ........*..................... + str r9, [r0], #4 // .........*.................... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr r11, [r1], #8 // *.............................. - // ldr.w r4, [r0, #4] // *.............................. 
- // ldr r6, [r3, #4] // .*............................. - // ldr r12, [r0, #8] // ...*........................... - // ldr r8, [r2, #-4] // ....*.......................... - // smladx r5, r11, r10, r4 // ..*............................ - // ldr r9, [r1, #-4] // .....*......................... - // str r5, [r0, #4] // ...*........................... - // smlad r12, r6, r8, r12 // ......*........................ - // ldr r7, [r0, #12] // ......*........................ - // ldr r11, [r3], #8 // .*............................. - // str r12, [r0, #8] // .......*....................... - // ldr.w r4, [r0] // ..*............................ - // smladx r9, r9, r8, r7 // ........*...................... - // str r9, [r0, #12] // .........*..................... - // subs.w r14, r14, #1 // .......*....................... - // smlad r4, r11, r10, r4 // ....*.......................... - // str r4, [r0], #16 // .....*......................... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr r10, [r2], #4 // *.............................. + // ldr.w r7, [r0] // *.............................. + // ldr r5, [r1], #4 // .*............................. + // ldr.w r12, [r0, #8] // .*............................. + // ldr.w r9, [r0, #4] // ...*........................... + // smlad r7, r4, r10, r7 // ..*............................ + // ldr r6, [r2], #4 // ..*............................ + // str r7, [r0], #4 // ...*........................... + // ldr.w r7, [r0, #8] // ......*........................ + // smladx r5, r5, r10, r9 // .....*......................... + // ldr r11, [r1], #4 // .....*......................... + // smlad r10, r8, r6, r12 // ....*.......................... + // subs.w r14, r14, #1 // ....*.......................... + // str r5, [r0], #4 // ......*........................ + // smladx r7, r11, r6, r7 // ........*...................... + // str r10, [r0], #4 // .......*....................... 
+ // str r7, [r0], #4 // .........*..................... + // bne.w 1b // .........*..................... pop {r4-r11, pc} diff --git a/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s b/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s index 63e37d6e..4cd64405 100644 --- a/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s +++ b/examples/opt/armv7m/frombytes_mul_acc_32_16_kyber_opt_m7.s @@ -12,7 +12,7 @@ .macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv ldr \poly0, [\bptr], #8 - ldr \res0, [\rptr_tmp], #16 // @slothy:core + ldr \res0, [\rptr_tmp], #16 // @slothy:core // @slothy:before=cmp smulwt \tmp, \zeta, \poly1 smlabt \tmp, \tmp, \q, \qa @@ -72,7 +72,7 @@ frombytes_mul_asm_acc_32_16_opt_m7: push {r4-r11, r14} rptr .req r0 - bptr .req r1 + bptr .req r3 aptr .req r2 zetaptr .req r3 t0 .req r4 @@ -85,7 +85,7 @@ frombytes_mul_asm_acc_32_16_opt_m7: qinv .req r11 zeta .req r12 ctr .req r14 - rptr_tmp .req r3 + rptr_tmp .req r1 movw qa, #26632 movt q, #3329 @@ -93,264 +93,258 @@ frombytes_mul_asm_acc_32_16_opt_m7: movw qinv, #62209 movt qinv, #27560 - vmov s2, zetaptr + vmov s1, r1 ldr.w rptr_tmp, [sp, #9*4] // load rptr_tmp from stack - vmov s1, rptr_tmp + add ctr, rptr_tmp, #64*4*4 - // Instructions: 6 - // Expected cycles: 5 - // Expected IPC: 1.20 - // - // Cycle bound: 5.0 - // IPC bound: 1.20 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldrb.w r8, [r2, #5] // *............................. - ldrh.w r12, [r2, #3] // *............................. - ldrb.w r4, [r2, #2] // .*............................ - ldrh.w r6, [r2], #6 // .*............................ - ubfx r7, r12, #12, #4 // ...*.......................... - orr r5, r7, r8, lsl #4 // ....*......................... 
+ // Instructions: 5 + // Expected cycles: 5 + // Expected IPC: 1.00 + // + // Cycle bound: 5.0 + // IPC bound: 1.00 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldrb.w r8, [r2, #5] // *............................. + ldrh.w r7, [r2, #3] // *............................. + ldrb.w r12, [r2, #2] // .*............................ + ldrh.w r5, [r2], #6 // .*............................ + ubfx r4, r5, #12, #4 // ....*......................... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldrh.w r12, [r2, #3] // *.............................. - // ldrb.w r7, [r2, #5] // *.............................. - // ldrb.w r4, [r2, #2] // .*............................. - // ubfx r5, r12, #12, #4 // ...*........................... - // orr r5, r5, r7, lsl #4 // ....*.......................... - // ldrh.w r6, [r2], #6 // .*............................. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldrb.w r8, [r2, #5] // *.............................. + // ldrh.w r7, [r2, #3] // *.............................. + // ldrb.w r12, [r2, #2] // .*............................. + // ldrh.w r5, [r2], #6 // .*............................. + // ubfx r4, r5, #12, #4 // ....*.......................... sub r14, r14, #16 1: - // Instructions: 45 - // Expected cycles: 26 - // Expected IPC: 1.73 - // - // Cycle bound: 29.0 - // IPC bound: 1.55 - // - // Wall time: 10.31s - // User time: 10.31s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ubfx r7, r6, #12, #4 // *............................. - vmov r8, s2 // *............................. - orr r7, r7, r4, lsl #4 // .*............................ - ldr r4, [r8], #4 // .*............................ - ubfx r6, r6, #0, #12 // ..*........................... - orr r6, r6, r7, lsl #16 // ...*.......................... 
- vmov s2, r8 // ...*.......................... - ubfx r8, r12, #0, #12 // ....*......................... - smulwt r7, r4, r6 // ....*......................... - orr r5, r8, r5, lsl #16 // .....*........................ - ldr r12, [r1], #8 // .....*........................ - neg r4, r4 // ......*....................... - smlabt r8, r7, r9, r10 // ......*....................... - ldr r7, [r3], #16 // .......*...................... // @slothy:core - smulwt r4, r4, r5 // .......*...................... - smlatt r7, r12, r8, r7 // ........*..................... - smlabt r4, r4, r9, r10 // .........*.................... - ldr r8, [r3, #-12] // .........*.................... - smlabb r7, r12, r6, r7 // ..........*................... - smladx r8, r12, r6, r8 // ...........*.................. - mul r7, r7, r11 // ............*................. - ldr r6, [r1, #-4] // .............*................ - mul r8, r8, r11 // .............*................ - smlatt r12, r7, r9, r10 // ..............*............... - ldr r7, [r3, #-8] // ..............*............... - smlatt r8, r8, r9, r10 // ...............*.............. - smlatt r7, r6, r4, r7 // ................*............. - ldr r4, [r3, #-4] // ................*............. - pkhtb r12, r8, r12, asr #16 // .................*............ - str r12, [r0], #8 // .................*............ - ldrh.w r12, [r2, #3] // ..................e........... - smlabb r8, r6, r5, r7 // ..................*........... - smladx r6, r6, r5, r4 // ...................*.......... - ldrb.w r7, [r2, #5] // ...................e.......... - ldrb.w r4, [r2, #2] // ....................e......... - mul r8, r8, r11 // ....................*......... - ubfx r5, r12, #12, #4 // .....................e........ - mul r6, r6, r11 // .....................*........ - orr r5, r5, r7, lsl #4 // ......................e....... - smlatt r7, r8, r9, r10 // ......................*....... - smlatt r8, r6, r9, r10 // .......................*...... 
- ldrh.w r6, [r2], #6 // .......................e...... - cmp.w r3, r14 // ........................*..... - pkhtb r8, r8, r7, asr #16 // .........................*.... - str r8, [r0, #-4] // .........................*.... - - // ------- cycle (expected) --------> - // 0 25 - // |------------------------|-------- - // ldrb.w r6, [r2, #2] // ..e.....'...................~..... - // ldrh.w r7, [r2, #3] // e.......'.................~....... - // ldrb.w r8, [r2, #5] // .e......'..................~...... - // ldrh.w r4, [r2], #6 // .....e..'......................~.. - // ubfx r5, r4, #12, #4 // ........*......................... - // ubfx r4, r4, #0, #12 // ........'.*....................... - // orr r5, r5, r6, lsl #4 // ........'*........................ - // orr r4, r4, r5, lsl #16 // ........'..*...................... - // ubfx r5, r7, #12, #4 // ...e....'....................~.... - // ubfx r6, r7, #0, #12 // ........'...*..................... - // orr r5, r5, r8, lsl #4 // ....e...'.....................~... - // orr r5, r6, r5, lsl #16 // ........'....*.................... - // vmov r6, s2 // ........*......................... - // ldr r12, [r6], #4 // ........'*........................ - // vmov s2, r6 // ........'..*...................... - // ldr r8, [r1], #8 // ........'....*.................... - // ldr r6, [r3], #16 // ........'......*.................. - // smulwt r7, r12, r4 // ........'...*..................... - // smlabt r7, r7, r9, r10 // ........'.....*................... - // smlatt r7, r8, r7, r6 // ........'.......*................. - // smlabb r7, r8, r4, r7 // ........'.........*............... - // mul r7, r7, r11 // ........'...........*............. - // smlatt r7, r7, r9, r10 // ........'.............*........... - // ldr r6, [r3, #-12] // ........'........*................ - // smladx r6, r8, r4, r6 // ........'..........*.............. - // mul r6, r6, r11 // ........'............*............ 
- // smlatt r6, r6, r9, r10 // ........'..............*.......... - // pkhtb r6, r6, r7, asr #16 // ........'................*........ - // str r6, [r0], #8 // ........'................*........ - // neg r12, r12 // ........'.....*................... - // ldr r8, [r1, #-4] // ........'............*............ - // ldr r6, [r3, #-8] // ........'.............*........... - // smulwt r7, r12, r5 // ........'......*.................. - // smlabt r7, r7, r9, r10 // ........'........*................ - // smlatt r7, r8, r7, r6 // ........'...............*......... - // smlabb r7, r8, r5, r7 // ~.......'.................*....... - // mul r7, r7, r11 // ..~.....'...................*..... - // smlatt r7, r7, r9, r10 // ....~...'.....................*... - // ldr r6, [r3, #-4] // ........'...............*......... - // smladx r6, r8, r5, r6 // .~......'..................*...... - // mul r6, r6, r11 // ...~....'....................*.... - // smlatt r6, r6, r9, r10 // .....~..'......................*.. - // pkhtb r6, r6, r7, asr #16 // .......~'........................* - // str r6, [r0, #-4] // .......~'........................* - // cmp.w r3, r14 // ......~.'.......................*. - - bne 1b - // Instructions: 39 - // Expected cycles: 25 - // Expected IPC: 1.56 + // Instructions: 48 + // Expected cycles: 26 + // Expected IPC: 1.85 // - // Cycle bound: 25.0 - // IPC bound: 1.56 + // Cycle bound: 29.0 + // IPC bound: 1.66 // - // Wall time: 0.95s - // User time: 0.95s + // Wall time: 11.37s + // User time: 11.37s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ubfx r7, r6, #12, #4 // *............................. - vmov r8, s2 // *............................. - orr r4, r7, r4, lsl #4 // .*............................ - ldr r7, [r8], #4 // .*............................ - ubfx r6, r6, #0, #12 // ..*........................... - vmov s2, r8 // ...*.......................... - orr r8, r6, r4, lsl #16 // ...*.......................... 
- ubfx r6, r12, #0, #12 // ....*......................... - smulwt r4, r7, r8 // ....*......................... - neg r7, r7 // .....*........................ - orr r5, r6, r5, lsl #16 // .....*........................ - smulwt r6, r7, r5 // ......*....................... - ldr r12, [r1], #8 // ......*....................... - smlabt r4, r4, r9, r10 // .......*...................... - ldr r7, [r3], #16 // .......*...................... // @slothy:core - smlabt r6, r6, r9, r10 // ........*..................... - smlatt r4, r12, r4, r7 // .........*.................... - ldr r7, [r3, #-12] // .........*.................... - cmp.w r3, r14 // ..........*................... - smlabb r4, r12, r8, r4 // ..........*................... - smladx r7, r12, r8, r7 // ...........*.................. - mul r4, r4, r11 // ............*................. - ldr r8, [r1, #-4] // ............*................. - mul r12, r7, r11 // .............*................ - smlatt r7, r4, r9, r10 // ..............*............... - smlatt r4, r12, r9, r10 // ...............*.............. - ldr r12, [r3, #-8] // ...............*.............. - smlatt r12, r8, r6, r12 // ................*............. - ldr r6, [r3, #-4] // ................*............. - pkhtb r4, r4, r7, asr #16 // .................*............ - smlabb r12, r8, r5, r12 // .................*............ - smladx r6, r8, r5, r6 // ..................*........... - mul r7, r12, r11 // ...................*.......... - mul r12, r6, r11 // ....................*......... - smlatt r5, r7, r9, r10 // .....................*........ - smlatt r7, r12, r9, r10 // ......................*....... - str r4, [r0], #8 // .......................*...... - pkhtb r8, r7, r5, asr #16 // ........................*..... - str r8, [r0, #-4] // ........................*..... + orr r6, r4, r12, lsl #4 // *............................. + ldr.w r4, [r3], #4 // *............................. + ubfx r5, r5, #0, #12 // .*............................ 
+ vmov r12, s1 // .*............................ + vmov s27, r3 // ..*........................... + orr r5, r5, r6, lsl #16 // ..*........................... + ubfx r3, r7, #12, #4 // ...*.......................... + smulwt r6, r4, r5 // ...*.......................... + neg r4, r4 // ....*......................... + orr r8, r3, r8, lsl #4 // ....*......................... + ubfx r7, r7, #0, #12 // .....*........................ + smlabt r6, r6, r9, r10 // .....*........................ + orr r8, r7, r8, lsl #16 // ......*....................... + ldr r3, [r12], #8 // ......*....................... + ldr r7, [r1], #16 // .......*...................... // @slothy:core // @slothy:before=cmp + smulwt r4, r4, r8 // .......*...................... + smlatt r6, r3, r6, r7 // ........*..................... + ldr r7, [r1, #-12] // .........*.................... + smlabt r4, r4, r9, r10 // .........*.................... + vmov s1, r12 // ..........*................... // @slothy:core + smlabb r6, r3, r5, r6 // ..........*................... + cmp.w r1, r14 // ...........*.................. // @slothy:id=cmp + smladx r7, r3, r5, r7 // ...........*.................. + ldr r5, [r12, #-4] // ............*................. + mul r6, r6, r11 // ............*................. + ldr r3, [r1, #-8] // .............*................ + mul r7, r7, r11 // .............*................ + ldr r12, [r1, #-4] // ..............*............... + smlatt r4, r5, r4, r3 // ..............*............... + smlabb r4, r5, r8, r4 // ...............*.............. + smladx r12, r5, r8, r12 // ................*............. + ldrb.w r8, [r2, #5] // .................e............ + smlatt r6, r6, r9, r10 // .................*............ + smlatt r3, r7, r9, r10 // ..................*........... + ldrh.w r7, [r2, #3] // ...................e.......... + mul r4, r4, r11 // ...................*.......... + pkhtb r3, r3, r6, asr #16 // ....................*......... 
+ mul r6, r12, r11 // ....................*......... + ldrb.w r12, [r2, #2] // .....................e........ + smlatt r4, r4, r9, r10 // .....................*........ + ldrh.w r5, [r2], #6 // ......................e....... + smlatt r6, r6, r9, r10 // ......................*....... + str r3, [r0], #8 // .......................*...... + vmov r3, s27 // .......................*...... + pkhtb r4, r6, r4, asr #16 // ........................*..... + str r4, [r0, #-4] // ........................*..... + ubfx r4, r5, #12, #4 // .........................e.... + bne.w 1b // .........................*.... // @slothy:branch + + // -------- cycle (expected) --------> + // 0 25 + // |------------------------|--------- + // ldr.w r12, [r3], #4 // .........*......................... + // ldrb.w r6, [r2, #2] // ....e....'....................~.... + // ldrh.w r7, [r2, #3] // ..e......'..................~...... + // ldrb.w r8, [r2, #5] // e........'................~........ + // ldrh.w r4, [r2], #6 // .....e...'.....................~... + // ubfx r5, r4, #12, #4 // ........e'......................... + // ubfx r4, r4, #0, #12 // .........'*........................ + // orr r5, r5, r6, lsl #4 // .........*......................... + // orr r4, r4, r5, lsl #16 // .........'.*....................... + // ubfx r5, r7, #12, #4 // .........'..*...................... + // ubfx r6, r7, #0, #12 // .........'....*.................... + // orr r5, r5, r8, lsl #4 // .........'...*..................... + // orr r5, r6, r5, lsl #16 // .........'.....*................... + // vmov s2, r3 // .........'.*....................... + // vmov r3, s1 // .........'*........................ + // ldr r8, [r3], #8 // .........'.....*................... + // ldr r6, [r1], #16 // .........'......*.................. + // smulwt r7, r12, r4 // .........'..*...................... + // smlabt r7, r7, r9, r10 // .........'....*.................... + // smlatt r7, r8, r7, r6 // .........'.......*................. 
+ // smlabb r7, r8, r4, r7 // .........'.........*............... + // mul r7, r7, r11 // .........'...........*............. + // smlatt r7, r7, r9, r10 // ~........'................*........ + // ldr r6, [r1, #-12] // .........'........*................ + // smladx r6, r8, r4, r6 // .........'..........*.............. + // mul r6, r6, r11 // .........'............*............ + // smlatt r6, r6, r9, r10 // .~.......'.................*....... + // pkhtb r6, r6, r7, asr #16 // ...~.....'...................*..... + // str r6, [r0], #8 // ......~..'......................*.. + // neg r12, r12 // .........'...*..................... + // ldr r8, [r3, #-4] // .........'...........*............. + // ldr r6, [r1, #-8] // .........'............*............ + // smulwt r7, r12, r5 // .........'......*.................. + // smlabt r7, r7, r9, r10 // .........'........*................ + // smlatt r7, r8, r7, r6 // .........'.............*........... + // smlabb r7, r8, r5, r7 // .........'..............*.......... + // mul r7, r7, r11 // ..~......'..................*...... + // smlatt r7, r7, r9, r10 // ....~....'....................*.... + // ldr r6, [r1, #-4] // .........'.............*........... + // smladx r6, r8, r5, r6 // .........'...............*......... + // mul r6, r6, r11 // ...~.....'...................*..... + // smlatt r6, r6, r9, r10 // .....~...'.....................*... + // pkhtb r6, r6, r7, asr #16 // .......~.'.......................*. + // str r6, [r0, #-4] // .......~.'.......................*. + // vmov s1, r3 // .........'.........*............... + // cmp.w r1, r14 // .........'..........*.............. + // vmov r3, s2 // ......~..'......................*.. + // bne.w 1b // ........~'........................* - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ubfx r7, r6, #12, #4 // *.............................. - // vmov r8, s2 // *.............................. 
- // orr r7, r7, r4, lsl #4 // .*............................. - // ldr r4, [r8], #4 // .*............................. - // ubfx r6, r6, #0, #12 // ..*............................ - // orr r6, r6, r7, lsl #16 // ...*........................... - // vmov s2, r8 // ...*........................... - // ubfx r8, r12, #0, #12 // ....*.......................... - // smulwt r7, r4, r6 // ....*.......................... - // orr r5, r8, r5, lsl #16 // .....*......................... - // ldr r12, [r1], #8 // ......*........................ - // neg r4, r4 // .....*......................... - // smlabt r8, r7, r9, r10 // .......*....................... - // ldr r7, [r3], #16 // .......*....................... - // smulwt r4, r4, r5 // ......*........................ - // smlatt r7, r12, r8, r7 // .........*..................... - // smlabt r4, r4, r9, r10 // ........*...................... - // ldr r8, [r3, #-12] // .........*..................... - // smlabb r7, r12, r6, r7 // ..........*.................... - // smladx r8, r12, r6, r8 // ...........*................... - // mul r7, r7, r11 // ............*.................. - // ldr r6, [r1, #-4] // ............*.................. - // mul r8, r8, r11 // .............*................. - // smlatt r12, r7, r9, r10 // ..............*................ - // ldr r7, [r3, #-8] // ...............*............... - // smlatt r8, r8, r9, r10 // ...............*............... - // smlatt r7, r6, r4, r7 // ................*.............. - // ldr r4, [r3, #-4] // ................*.............. - // pkhtb r12, r8, r12, asr #16 // .................*............. - // str r12, [r0], #8 // .......................*....... - // smlabb r8, r6, r5, r7 // .................*............. - // smladx r6, r6, r5, r4 // ..................*............ - // mul r8, r8, r11 // ...................*........... - // mul r6, r6, r11 // ....................*.......... - // smlatt r7, r8, r9, r10 // .....................*......... 
- // smlatt r8, r6, r9, r10 // ......................*........ - // cmp.w r3, r14 // ..........*.................... - // pkhtb r8, r8, r7, asr #16 // ........................*...... - // str r8, [r0, #-4] // ........................*...... + // Instructions: 43 + // Expected cycles: 26 + // Expected IPC: 1.65 + // + // Cycle bound: 26.0 + // IPC bound: 1.65 + // + // Wall time: 1.77s + // User time: 1.77s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + orr r12, r4, r12, lsl #4 // *............................. + ldr.w r4, [r3], #4 // *............................. + ubfx r5, r5, #0, #12 // .*............................ + vmov r6, s1 // .*............................ + orr r12, r5, r12, lsl #16 // ..*........................... + vmov s27, r3 // ..*........................... + ubfx r5, r7, #12, #4 // ...*.......................... + smulwt r3, r4, r12 // ...*.......................... + orr r8, r5, r8, lsl #4 // ....*......................... + ldr r5, [r6], #8 // ....*......................... + ubfx r7, r7, #0, #12 // .....*........................ + smlabt r3, r3, r9, r10 // .....*........................ + orr r7, r7, r8, lsl #16 // ......*....................... + ldr r8, [r1], #16 // ......*....................... // @slothy:core // @slothy:before=cmp + neg r4, r4 // .......*...................... + smlatt r8, r5, r3, r8 // .......*...................... + ldr r3, [r1, #-12] // ........*..................... + smlabb r8, r5, r12, r8 // ........*..................... + cmp.w r1, r14 // .........*.................... // @slothy:id=cmp + smulwt r4, r4, r7 // .........*.................... + vmov s1, r6 // ..........*................... // @slothy:core + smladx r5, r5, r12, r3 // ..........*................... + ldr r12, [r6, #-4] // ...........*.................. + smlabt r6, r4, r9, r10 // ...........*.................. + ldr r3, [r1, #-8] // ............*................. 
+ mul r8, r8, r11 // ............*................. + ldr r4, [r1, #-4] // .............*................ + smlatt r6, r12, r6, r3 // .............*................ + vmov r3, s27 // ..............*............... + smlabb r6, r12, r7, r6 // ..............*............... + smladx r7, r12, r7, r4 // ...............*.............. + mul r12, r6, r11 // ................*............. + mul r6, r7, r11 // .................*............ + smlatt r12, r12, r9, r10 // ..................*........... + smlatt r7, r6, r9, r10 // ...................*.......... + mul r4, r5, r11 // ....................*......... + pkhtb r5, r7, r12, asr #16 // .....................*........ + smlatt r8, r8, r9, r10 // .....................*........ + smlatt r4, r4, r9, r10 // ......................*....... + pkhtb r6, r4, r8, asr #16 // ........................*..... + str r6, [r0], #8 // ........................*..... + str r5, [r0, #-4] // .........................*.... - // Original code - // ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack - // vmov s1, tmp - // vmov s2, zetaptr - // add ctr, tmp, #64*4*4 - // 1: - // vmov zetaptr, s2 - // ldr.w zeta, [zetaptr], #4 - // deserialize aptr, tmp, tmp2, tmp3, t0, t1 - // vmov s2, zetaptr - // vmov rptr_tmp, s1 - // doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv - // vmov s1, rptr_tmp - // cmp.w rptr_tmp, ctr - // bne.w 1b + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // orr r6, r4, r12, lsl #4 // *.............................. + // ldr.w r4, [r3], #4 // *.............................. + // ubfx r5, r5, #0, #12 // .*............................. + // vmov r12, s1 // .*............................. + // vmov s27, r3 // ..*............................ + // orr r5, r5, r6, lsl #16 // ..*............................ + // ubfx r3, r7, #12, #4 // ...*........................... + // smulwt r6, r4, r5 // ...*........................... 
+ // neg r4, r4 // .......*....................... + // orr r8, r3, r8, lsl #4 // ....*.......................... + // ubfx r7, r7, #0, #12 // .....*......................... + // smlabt r6, r6, r9, r10 // .....*......................... + // orr r8, r7, r8, lsl #16 // ......*........................ + // ldr r3, [r12], #8 // ....*.......................... + // ldr r7, [r1], #16 // ......*........................ + // smulwt r4, r4, r8 // .........*..................... + // smlatt r6, r3, r6, r7 // .......*....................... + // ldr r7, [r1, #-12] // ........*...................... + // smlabt r4, r4, r9, r10 // ...........*................... + // vmov s1, r12 // ..........*.................... + // smlabb r6, r3, r5, r6 // ........*...................... + // cmp.w r1, r14 // .........*..................... + // smladx r7, r3, r5, r7 // ..........*.................... + // ldr r5, [r12, #-4] // ...........*................... + // mul r6, r6, r11 // ............*.................. + // ldr r3, [r1, #-8] // ............*.................. + // mul r7, r7, r11 // ....................*.......... + // ldr r12, [r1, #-4] // .............*................. + // smlatt r4, r5, r4, r3 // .............*................. + // smlabb r4, r5, r8, r4 // ..............*................ + // smladx r12, r5, r8, r12 // ...............*............... + // smlatt r6, r6, r9, r10 // .....................*......... + // smlatt r3, r7, r9, r10 // ......................*........ + // mul r4, r4, r11 // ................*.............. + // pkhtb r3, r3, r6, asr #16 // ........................*...... + // mul r6, r12, r11 // .................*............. + // smlatt r4, r4, r9, r10 // ..................*............ + // smlatt r6, r6, r9, r10 // ...................*........... + // str r3, [r0], #8 // ........................*...... + // vmov r3, s27 // ..............*................ + // pkhtb r4, r6, r4, asr #16 // .....................*......... 
+ // str r4, [r0, #-4] // .........................*..... + // bne.w 1b // .........................*..... -pop {r4-r11, pc} +pop {r4-r11, pc} .size frombytes_mul_asm_acc_32_16_opt_m7, .-frombytes_mul_asm_acc_32_16_opt_m7 \ No newline at end of file diff --git a/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s b/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s index 0f35011c..df797e2c 100644 --- a/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s +++ b/examples/opt/armv7m/frombytes_mul_acc_kyber_opt_m7.s @@ -32,7 +32,7 @@ // r[1] in upper half of tmp2 pkhtb \tmp, \tmp2, \tmp, asr #16 uadd16 \res0, \res0, \tmp - str \res0, [\rptr], #8 // @slothy:core + str \res0, [\rptr], #8 // @slothy:core // @slothy:before=cmp neg \zeta, \zeta @@ -101,237 +101,249 @@ frombytes_mul_asm_acc_opt_m7: movt qinv, #27560 add ctr, rptr, #64*4*2 - vmov s0, ctr - // Instructions: 6 - // Expected cycles: 5 - // Expected IPC: 1.20 - // - // Cycle bound: 5.0 - // IPC bound: 1.20 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldrh.w r8, [r2, #3] // *............................. - ldrb.w r5, [r2, #5] // .*............................ - ldrb.w r4, [r2, #2] // ..*........................... - ldrh.w r7, [r2], #6 // ..*........................... - ubfx r12, r8, #12, #4 // ...*.......................... - orr r12, r12, r5, lsl #4 // ....*......................... + // Instructions: 5 + // Expected cycles: 5 + // Expected IPC: 1.00 + // + // Cycle bound: 5.0 + // IPC bound: 1.00 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldrb.w r8, [r2, #2] // *............................. + ldrh.w r5, [r2, #3] // *............................. + ldrb.w r7, [r2, #5] // .*............................ + ldrh.w r4, [r2], #6 // .*............................ + ubfx r6, r4, #12, #4 // ....*......................... 
- // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldrh.w r8, [r2, #3] // *.............................. - // ldrb.w r4, [r2, #5] // .*............................. - // ubfx r12, r8, #12, #4 // ...*........................... - // orr r12, r12, r4, lsl #4 // ....*.......................... - // ldrb.w r4, [r2, #2] // ..*............................ - // ldrh.w r7, [r2], #6 // ..*............................ + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldrb.w r8, [r2, #2] // *.............................. + // ldrh.w r5, [r2, #3] // *.............................. + // ldrb.w r7, [r2, #5] // .*............................. + // ldrh.w r4, [r2], #6 // .*............................. + // ubfx r6, r4, #12, #4 // ....*.......................... - push {ctr} - vmov ctr, s0 - sub ctr, ctr, #8 - vmov s0, ctr - pop {ctr} + push {r14} + vmov r14, s0 + sub r14, r14, #8 + vmov s0, r14 + pop {r14} 1: - // Instructions: 42 - // Expected cycles: 25 - // Expected IPC: 1.68 - // - // Cycle bound: 27.0 - // IPC bound: 1.56 - // - // Wall time: 3.82s - // User time: 3.82s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ubfx r6, r7, #12, #4 // *............................. - ldr.w r5, [r3], #4 // *............................. - orr r6, r6, r4, lsl #4 // .*............................ - ldr r14, [r0] // .*............................ - ubfx r4, r7, #0, #12 // ..*........................... - orr r4, r4, r6, lsl #16 // ...*.......................... - ldr r7, [r1], #8 // ...*.......................... - ubfx r8, r8, #0, #12 // ....*......................... - smulwt r6, r5, r4 // ....*......................... - orr r12, r8, r12, lsl #16 // .....*........................ - smuadx r8, r7, r4 // .....*........................ - neg r5, r5 // ......*....................... - smlabt r6, r6, r9, r10 // ......*....................... 
- mul r8, r8, r11 // .......*...................... - smultt r6, r7, r6 // ........*..................... - smlabb r7, r7, r4, r6 // .........*.................... - smulwt r6, r5, r12 // ..........*................... - mul r4, r7, r11 // ...........*.................. - smlabt r5, r6, r9, r10 // ............*................. - ldr r6, [r1, #-4] // .............*................ - smlatt r7, r4, r9, r10 // .............*................ - smlatt r4, r8, r9, r10 // ..............*............... - ldrh.w r8, [r2, #3] // ...............e.............. - smultt r5, r6, r5 // ...............*.............. - pkhtb r7, r4, r7, asr #16 // ................*............. - smlabb r5, r6, r12, r5 // ................*............. - ldrb.w r4, [r2, #5] // .................e............ - smuadx r6, r6, r12 // .................*............ - uadd16 r14, r14, r7 // ..................*........... - mul r7, r5, r11 // ..................*........... - ubfx r12, r8, #12, #4 // ...................e.......... - mul r5, r6, r11 // ...................*.......... - orr r12, r12, r4, lsl #4 // ....................e......... - smlatt r6, r7, r9, r10 // ....................*......... - ldrb.w r4, [r2, #2] // .....................e........ - smlatt r5, r5, r9, r10 // .....................*........ - str r14, [r0], #8 // ......................*....... // @slothy:core - ldrh.w r7, [r2], #6 // ......................e....... - pkhtb r6, r5, r6, asr #16 // .......................*...... - ldr r14, [r0, #-4] // .......................*...... - uadd16 r14, r14, r6 // ........................*..... - str r14, [r0, #-4] // ........................*..... + // Instructions: 46 + // Expected cycles: 26 + // Expected IPC: 1.77 + // + // Cycle bound: 28.0 + // IPC bound: 1.64 + // + // Wall time: 8.16s + // User time: 8.16s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + orr r12, r6, r8, lsl #4 // *............................. 
+ ldr.w r6, [r3], #4 // *............................. + ubfx r8, r4, #0, #12 // .*............................ + ldr r4, [r1], #8 // .*............................ + orr r8, r8, r12, lsl #16 // ..*........................... + vmov s0, r14 // ..*........................... + ubfx r12, r5, #12, #4 // ...*.......................... + smuadx r14, r4, r8 // ...*.......................... + orr r12, r12, r7, lsl #4 // ....*......................... + smulwt r7, r6, r8 // ....*......................... + ubfx r5, r5, #0, #12 // .....*........................ + mul r14, r14, r11 // .....*........................ + orr r12, r5, r12, lsl #16 // ......*....................... + smlabt r5, r7, r9, r10 // ......*....................... + neg r6, r6 // .......*...................... + smlatt r7, r14, r9, r10 // .......*...................... + smultt r5, r4, r5 // ........*..................... + smlabb r4, r4, r8, r5 // .........*.................... + ldr r8, [r1, #-4] // ..........*................... + smulwt r5, r6, r12 // ..........*................... + mul r6, r4, r11 // ...........*.................. + ldr r4, [r0] // ............*................. + smlabt r5, r5, r9, r10 // ............*................. + smlatt r6, r6, r9, r10 // .............*................ + smultt r5, r8, r5 // ..............*............... + vmov r14, s0 // ...............*.............. + smlabb r5, r8, r12, r5 // ...............*.............. + pkhtb r6, r7, r6, asr #16 // ................*............. + smuadx r7, r8, r12 // ................*............. + uadd16 r4, r4, r6 // .................*............ + mul r6, r5, r11 // .................*............ + ldrb.w r8, [r2, #2] // ..................e........... + mul r12, r7, r11 // ..................*........... + ldrh.w r5, [r2, #3] // ...................e.......... + smlatt r6, r6, r9, r10 // ...................*.......... + ldrb.w r7, [r2, #5] // ....................e......... 
+ smlatt r12, r12, r9, r10 // ....................*......... + str r4, [r0], #8 // .....................*........ // @slothy:core // @slothy:before=cmp + ldrh.w r4, [r2], #6 // .....................e........ + pkhtb r6, r12, r6, asr #16 // ......................*....... + ldr r12, [r0, #-4] // ......................*....... + uadd16 r12, r12, r6 // .......................*...... + cmp.w r0, r14 // .......................*...... // @slothy:id=cmp + ubfx r6, r4, #12, #4 // ........................e..... + str r12, [r0, #-4] // ........................*..... + bne.w 1b // .........................*.... // @slothy:branch - // -------- cycle (expected) --------> + // ------- cycle (expected) --------> // 0 25 - // |------------------------|--------- - // ldr.w r12, [r3], #4 // ..........*........................ - // ldrb.w r6, [r2, #2] // ......e...'....................~... - // ldrh.w r7, [r2, #3] // e.........'..............~......... - // ldrb.w r8, [r2, #5] // ..e.......'................~....... - // ldrh.w r4, [r2], #6 // .......e..'.....................~.. - // ubfx r5, r4, #12, #4 // ..........*........................ - // ubfx r4, r4, #0, #12 // ..........'.*...................... - // orr r5, r5, r6, lsl #4 // ..........'*....................... - // orr r4, r4, r5, lsl #16 // ..........'..*..................... - // ubfx r5, r7, #12, #4 // ....e.....'..................~..... - // ubfx r6, r7, #0, #12 // ..........'...*.................... - // orr r5, r5, r8, lsl #4 // .....e....'...................~.... - // orr r5, r6, r5, lsl #16 // ..........'....*................... - // ldr r8, [r1], #8 // ..........'..*..................... - // ldr r14, [r0] // ..........'*....................... - // smulwt r6, r12, r4 // ..........'...*.................... - // smlabt r6, r6, r9, r10 // ..........'.....*.................. - // smultt r6, r8, r6 // ..........'.......*................ - // smlabb r6, r8, r4, r6 // ..........'........*............... 
- // mul r6, r6, r11 // ..........'..........*............. - // smlatt r6, r6, r9, r10 // ..........'............*........... - // smuadx r7, r8, r4 // ..........'....*................... - // mul r7, r7, r11 // ..........'......*................. - // smlatt r7, r7, r9, r10 // ..........'.............*.......... - // pkhtb r6, r7, r6, asr #16 // .~........'...............*........ - // uadd16 r14, r14, r6 // ...~......'.................*...... - // str r14, [r0], #8 // .......~..'.....................*.. - // neg r12, r12 // ..........'.....*.................. - // ldr r8, [r1, #-4] // ..........'............*........... - // ldr r14, [r0, #-4] // ........~.'......................*. - // smulwt r6, r12, r5 // ..........'.........*.............. - // smlabt r6, r6, r9, r10 // ..........'...........*............ - // smultt r6, r8, r6 // ~.........'..............*......... - // smlabb r6, r8, r5, r6 // .~........'...............*........ - // mul r6, r6, r11 // ...~......'.................*...... - // smlatt r6, r6, r9, r10 // .....~....'...................*.... - // smuadx r7, r8, r5 // ..~.......'................*....... - // mul r7, r7, r11 // ....~.....'..................*..... - // smlatt r7, r7, r9, r10 // ......~...'....................*... - // pkhtb r6, r7, r6, asr #16 // ........~.'......................*. - // uadd16 r14, r14, r6 // .........~'.......................* - // str r14, [r0, #-4] // .........~'.......................* + // |------------------------|-------- + // ldr.w r12, [r3], #4 // ........*......................... + // ldrb.w r6, [r2, #2] // e.......'.................~....... + // ldrh.w r7, [r2, #3] // .e......'..................~...... + // ldrb.w r8, [r2, #5] // ..e.....'...................~..... + // ldrh.w r4, [r2], #6 // ...e....'....................~.... + // ubfx r5, r4, #12, #4 // ......e.'.......................~. + // ubfx r4, r4, #0, #12 // ........'*........................ 
+ // orr r5, r5, r6, lsl #4 // ........*......................... + // orr r4, r4, r5, lsl #16 // ........'.*....................... + // ubfx r5, r7, #12, #4 // ........'..*...................... + // ubfx r6, r7, #0, #12 // ........'....*.................... + // orr r5, r5, r8, lsl #4 // ........'...*..................... + // orr r5, r6, r5, lsl #16 // ........'.....*................... + // vmov s0, r14 // ........'.*....................... + // ldr r8, [r1], #8 // ........'*........................ + // ldr r14, [r0] // ........'...........*............. + // smulwt r6, r12, r4 // ........'...*..................... + // smlabt r6, r6, r9, r10 // ........'.....*................... + // smultt r6, r8, r6 // ........'.......*................. + // smlabb r6, r8, r4, r6 // ........'........*................ + // mul r6, r6, r11 // ........'..........*.............. + // smlatt r6, r6, r9, r10 // ........'............*............ + // smuadx r7, r8, r4 // ........'..*...................... + // mul r7, r7, r11 // ........'....*.................... + // smlatt r7, r7, r9, r10 // ........'......*.................. + // pkhtb r6, r7, r6, asr #16 // ........'...............*......... + // uadd16 r14, r14, r6 // ........'................*........ + // str r14, [r0], #8 // ...~....'....................*.... + // neg r12, r12 // ........'......*.................. + // ldr r8, [r1, #-4] // ........'.........*............... + // ldr r14, [r0, #-4] // ....~...'.....................*... + // smulwt r6, r12, r5 // ........'.........*............... + // smlabt r6, r6, r9, r10 // ........'...........*............. + // smultt r6, r8, r6 // ........'.............*........... + // smlabb r6, r8, r5, r6 // ........'..............*.......... + // mul r6, r6, r11 // ........'................*........ + // smlatt r6, r6, r9, r10 // .~......'..................*...... + // smuadx r7, r8, r5 // ........'...............*......... 
+ // mul r7, r7, r11 // ~.......'.................*....... + // smlatt r7, r7, r9, r10 // ..~.....'...................*..... + // pkhtb r6, r7, r6, asr #16 // ....~...'.....................*... + // uadd16 r14, r14, r6 // .....~..'......................*.. + // str r14, [r0, #-4] // ......~.'.......................*. + // vmov r14, s0 // ........'..............*.......... + // cmp.w r0, r14 // .....~..'......................*.. + // bne.w 1b // .......~'........................* - vmov ctr, s0 - cmp rptr, ctr - bne 1b - // Instructions: 36 - // Expected cycles: 25 - // Expected IPC: 1.44 + + // Instructions: 41 + // Expected cycles: 26 + // Expected IPC: 1.58 // - // Cycle bound: 25.0 - // IPC bound: 1.44 + // Cycle bound: 26.0 + // IPC bound: 1.58 // - // Wall time: 0.44s - // User time: 0.44s + // Wall time: 1.19s + // User time: 1.19s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ubfx r6, r7, #12, #4 // *............................. - ldr r5, [r1], #8 // *............................. - orr r4, r6, r4, lsl #4 // .*............................ + ldr r12, [r1], #8 // *............................. + orr r8, r6, r8, lsl #4 // *............................. + ubfx r4, r4, #0, #12 // .*............................ ldr.w r6, [r3], #4 // .*............................ - ubfx r7, r7, #0, #12 // ..*........................... - ldr r14, [r0] // ..*........................... - orr r7, r7, r4, lsl #16 // ...*.......................... - ubfx r8, r8, #0, #12 // ....*......................... - smulwt r4, r6, r7 // ....*......................... - orr r8, r8, r12, lsl #16 // .....*........................ - smuadx r12, r5, r7 // .....*........................ - neg r6, r6 // ......*....................... - smlabt r4, r4, r9, r10 // ......*....................... - mul r12, r12, r11 // .......*...................... - smultt r4, r5, r4 // ........*..................... - smlabb r5, r5, r7, r4 // .........*.................... 
- ldr r7, [r1, #-4] // ..........*................... - smulwt r6, r6, r8 // ..........*................... - mul r4, r5, r11 // ...........*.................. - smlabt r5, r6, r9, r10 // ............*................. - smlatt r4, r4, r9, r10 // .............*................ - smlatt r6, r12, r9, r10 // ..............*............... - smultt r5, r7, r5 // ...............*.............. - pkhtb r12, r6, r4, asr #16 // ................*............. - smlabb r4, r7, r8, r5 // ................*............. - uadd16 r14, r14, r12 // .................*............ - smuadx r8, r7, r8 // .................*............ - mul r4, r4, r11 // ..................*........... - mul r8, r8, r11 // ...................*.......... - smlatt r6, r4, r9, r10 // ....................*......... - smlatt r4, r8, r9, r10 // .....................*........ - str r14, [r0], #8 // ......................*....... // @slothy:core - pkhtb r4, r4, r6, asr #16 // .......................*...... - ldr r14, [r0, #-4] // .......................*...... - uadd16 r14, r14, r4 // ........................*..... - str r14, [r0, #-4] // ........................*..... + vmov s0, r14 // ..*........................... + orr r8, r4, r8, lsl #16 // ..*........................... + ubfx r14, r5, #12, #4 // ...*.......................... + smulwt r4, r6, r8 // ...*.......................... + orr r7, r14, r7, lsl #4 // ....*......................... + smuadx r14, r12, r8 // ....*......................... + ubfx r5, r5, #0, #12 // .....*........................ + smlabt r4, r4, r9, r10 // .....*........................ + orr r5, r5, r7, lsl #16 // ......*....................... + mul r14, r14, r11 // ......*....................... + neg r6, r6 // .......*...................... + smultt r4, r12, r4 // .......*...................... + smlabb r8, r12, r8, r4 // ........*..................... + ldr r12, [r1, #-4] // .........*.................... + smulwt r7, r6, r5 // .........*.................... 
+ mul r8, r8, r11 // ..........*................... + smlabt r7, r7, r9, r10 // ...........*.................. + smlatt r8, r8, r9, r10 // ............*................. + ldr r6, [r0] // .............*................ + smlatt r4, r14, r9, r10 // .............*................ + vmov r14, s0 // ..............*............... + smultt r7, r12, r7 // ..............*............... + pkhtb r4, r4, r8, asr #16 // ...............*.............. + smlabb r8, r12, r5, r7 // ...............*.............. + uadd16 r4, r6, r4 // ................*............. + str r4, [r0], #8 // ................*............. // @slothy:core // @slothy:before=cmp + cmp.w r0, r14 // .................*............ // @slothy:id=cmp + smuadx r4, r12, r5 // .................*............ + ldr r7, [r0, #-4] // ..................*........... + mul r12, r8, r11 // ..................*........... + mul r5, r4, r11 // ...................*.......... + smlatt r12, r12, r9, r10 // ....................*......... + smlatt r5, r5, r9, r10 // .....................*........ + pkhtb r8, r5, r12, asr #16 // .......................*...... + uadd16 r8, r7, r8 // ........................*..... + str r8, [r0, #-4] // ........................*..... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ubfx r6, r7, #12, #4 // *.............................. - // ldr.w r5, [r3], #4 // .*............................. - // orr r6, r6, r4, lsl #4 // .*............................. - // ldr r14, [r0] // ..*............................ - // ubfx r4, r7, #0, #12 // ..*............................ - // orr r4, r4, r6, lsl #16 // ...*........................... - // ldr r7, [r1], #8 // *.............................. - // ubfx r8, r8, #0, #12 // ....*.......................... - // smulwt r6, r5, r4 // ....*.......................... - // orr r12, r8, r12, lsl #16 // .....*......................... - // smuadx r8, r7, r4 // .....*......................... 
- // neg r5, r5 // ......*........................ - // smlabt r6, r6, r9, r10 // ......*........................ - // mul r8, r8, r11 // .......*....................... - // smultt r6, r7, r6 // ........*...................... - // smlabb r7, r7, r4, r6 // .........*..................... - // smulwt r6, r5, r12 // ..........*.................... - // mul r4, r7, r11 // ...........*................... - // smlabt r5, r6, r9, r10 // ............*.................. - // ldr r6, [r1, #-4] // ..........*.................... - // smlatt r7, r4, r9, r10 // .............*................. - // smlatt r4, r8, r9, r10 // ..............*................ - // smultt r5, r6, r5 // ...............*............... - // pkhtb r7, r4, r7, asr #16 // ................*.............. - // smlabb r5, r6, r12, r5 // ................*.............. - // smuadx r6, r6, r12 // .................*............. - // uadd16 r14, r14, r7 // .................*............. - // mul r7, r5, r11 // ..................*............ - // mul r5, r6, r11 // ...................*........... - // smlatt r6, r7, r9, r10 // ....................*.......... - // smlatt r5, r5, r9, r10 // .....................*......... - // str r14, [r0], #8 // ......................*........ - // pkhtb r6, r5, r6, asr #16 // .......................*....... - // ldr r14, [r0, #-4] // .......................*....... - // uadd16 r14, r14, r6 // ........................*...... - // str r14, [r0, #-4] // ........................*...... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // orr r12, r6, r8, lsl #4 // *.............................. + // ldr.w r6, [r3], #4 // .*............................. + // ubfx r8, r4, #0, #12 // .*............................. + // ldr r4, [r1], #8 // *.............................. + // orr r8, r8, r12, lsl #16 // ..*............................ + // vmov s0, r14 // ..*............................ 
+ // ubfx r12, r5, #12, #4 // ...*........................... + // smuadx r14, r4, r8 // ....*.......................... + // orr r12, r12, r7, lsl #4 // ....*.......................... + // smulwt r7, r6, r8 // ...*........................... + // ubfx r5, r5, #0, #12 // .....*......................... + // mul r14, r14, r11 // ......*........................ + // orr r12, r5, r12, lsl #16 // ......*........................ + // smlabt r5, r7, r9, r10 // .....*......................... + // neg r6, r6 // .......*....................... + // smlatt r7, r14, r9, r10 // .............*................. + // smultt r5, r4, r5 // .......*....................... + // smlabb r4, r4, r8, r5 // ........*...................... + // ldr r8, [r1, #-4] // .........*..................... + // smulwt r5, r6, r12 // .........*..................... + // mul r6, r4, r11 // ..........*.................... + // ldr r4, [r0] // .............*................. + // smlabt r5, r5, r9, r10 // ...........*................... + // smlatt r6, r6, r9, r10 // ............*.................. + // smultt r5, r8, r5 // ..............*................ + // vmov r14, s0 // ..............*................ + // smlabb r5, r8, r12, r5 // ...............*............... + // pkhtb r6, r7, r6, asr #16 // ...............*............... + // smuadx r7, r8, r12 // .................*............. + // uadd16 r4, r4, r6 // ................*.............. + // mul r6, r5, r11 // ..................*............ + // mul r12, r7, r11 // ...................*........... + // smlatt r6, r6, r9, r10 // ....................*.......... + // smlatt r12, r12, r9, r10 // .....................*......... + // str r4, [r0], #8 // ................*.............. + // pkhtb r6, r12, r6, asr #16 // .......................*....... + // ldr r12, [r0, #-4] // ..................*............ + // uadd16 r12, r12, r6 // ........................*...... + // cmp.w r0, r14 // .................*............. 
+ // str r12, [r0, #-4] // ........................*...... + // bne.w 1b // .........................*..... pop {r4-r11, pc} From 630bf950a76650a66640cac9d6c9ea43067c1c0d Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 10 Jan 2025 16:34:13 +0100 Subject: [PATCH 2/3] CM7: Simplify Dilithium iNTT code * This commit simplifies the Dilithium iNTT naive implementations to revert modifications to the code originally taken from pqm4 that were only introduced to accommodate for shortcomings of slothy's abilities. * We can also enable the fixup in more cases due to switching of the loop-type + using `before` tag which is done here, too. This aids with performance. --- example.py | 6 +- .../naive/armv7m/intt_dilithium_123_456_78.s | 68 +- .../armv7m/intt_dilithium_123_456_78_opt_m7.s | 1642 +++++++++-------- 3 files changed, 869 insertions(+), 847 deletions(-) diff --git a/example.py b/example.py index c0a7cf65..ab328dc8 100644 --- a/example.py +++ b/example.py @@ -1605,7 +1605,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non def core(self, slothy): slothy.config.constraints.stalls_first_attempt = 16 - slothy.config.unsafe_address_offset_fixup = False + slothy.config.unsafe_address_offset_fixup = True slothy.config.variable_size = True @@ -1616,12 +1616,12 @@ def core(self, slothy): slothy.config.sw_pipelining.optimize_postamble = True slothy.config.sw_pipelining.allow_pre = True - slothy.optimize_loop("layer123_loop") + slothy.optimize_loop("layer123_loop", forced_loop_type=Arch_Armv7M.BranchLoop) slothy.optimize_loop("layer456_first_loop") slothy.optimize_loop("layer456_loop") slothy.config.inputs_are_outputs = True - slothy.optimize_loop("layer78_loop") + slothy.optimize_loop("layer78_loop", forced_loop_type=Arch_Armv7M.BranchLoop) class pointwise_montgomery_dilithium(Example): def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None): diff --git a/examples/naive/armv7m/intt_dilithium_123_456_78.s
b/examples/naive/armv7m/intt_dilithium_123_456_78.s index cd92e1d3..6cd3e27a 100644 --- a/examples/naive/armv7m/intt_dilithium_123_456_78.s +++ b/examples/naive/armv7m/intt_dilithium_123_456_78.s @@ -221,9 +221,9 @@ pqcrystals_dilithium_invntt_tomont: str.w pol5, [ptr_p, #5*distance/4] str.w pol6, [ptr_p, #6*distance/4] str.w pol7, [ptr_p, #7*distance/4] - str.w pol0, [ptr_p], #strincr + str.w pol0, [ptr_p], #strincr // @slothy:before=cmp vmov temp_l, s9 - cmp.w ptr_p, temp_l + cmp.w ptr_p, temp_l // @slothy:id=cmp bne.w layer123_loop sub ptr_p, #32*strincr @@ -248,21 +248,21 @@ pqcrystals_dilithium_invntt_tomont: ldr.w pol3, [ptr_p, #7*distance2/4] _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l - ldr.w pol0, [ptr_p], #128 - ldr pol1, [ptr_p, #1*distance2/4-128] - ldr pol2, [ptr_p, #2*distance2/4-128] - ldr pol3, [ptr_p, #3*distance2/4-128] + ldr.w pol0, [ptr_p] + ldr pol1, [ptr_p, #1*distance2/4] + ldr pol2, [ptr_p, #2*distance2/4] + ldr pol3, [ptr_p, #3*distance2/4] _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l - str pol1, [ptr_p, #1*distance2/4-128] - str pol2, [ptr_p, #2*distance2/4-128] - str pol3, [ptr_p, #3*distance2/4-128] - str.w pol5, [ptr_p, #5*distance2/4-128] - str.w pol6, [ptr_p, #6*distance2/4-128] - str.w pol7, [ptr_p, #7*distance2/4-128] - str pol0, [ptr_p, #-128] - str.w pol4, [ptr_p], #128 - //add.w ptr_p, #strincr2 + str pol1, [ptr_p, #1*distance2/4] + str pol2, [ptr_p, #2*distance2/4] + str pol3, [ptr_p, #3*distance2/4] + str.w pol4, [ptr_p, #4*distance2/4] + str.w pol5, [ptr_p, #5*distance2/4] + str.w pol6, [ptr_p, #6*distance2/4] + str.w pol7, [ptr_p, #7*distance2/4] + str pol0, [ptr_p] + add.w ptr_p, ptr_p, #strincr2 vmov temp_l, s10 cmp.w ptr_p, temp_l @@ -281,26 +281,26 @@ pqcrystals_dilithium_invntt_tomont: vldm ptr_zeta!, {s2-s8} vmov s0, 
ptr_zeta layer456_loop: - ldr.w pol0, [ptr_p], #128 - ldr pol1, [ptr_p, #1*distance2/4-128] - ldr pol2, [ptr_p, #2*distance2/4-128] - ldr pol3, [ptr_p, #3*distance2/4-128] - ldr.w pol4, [ptr_p, #4*distance2/4-128] - ldr.w pol5, [ptr_p, #5*distance2/4-128] - ldr.w pol6, [ptr_p, #6*distance2/4-128] - ldr.w pol7, [ptr_p, #7*distance2/4-128] + ldr.w pol0, [ptr_p] + ldr pol1, [ptr_p, #1*distance2/4] + ldr pol2, [ptr_p, #2*distance2/4] + ldr pol3, [ptr_p, #3*distance2/4] + ldr.w pol4, [ptr_p, #4*distance2/4] + ldr.w pol5, [ptr_p, #5*distance2/4] + ldr.w pol6, [ptr_p, #6*distance2/4] + ldr.w pol7, [ptr_p, #7*distance2/4] _3_layer_inv_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l - str pol1, [ptr_p, #1*distance2/4-128] - str pol2, [ptr_p, #2*distance2/4-128] - str pol3, [ptr_p, #3*distance2/4-128] - str.w pol5, [ptr_p, #5*distance2/4-128] - str.w pol6, [ptr_p, #6*distance2/4-128] - str.w pol7, [ptr_p, #7*distance2/4-128] - str pol0, [ptr_p, #-128] - str.w pol4, [ptr_p], #128 - //add.w ptr_p, #strincr2 + str pol1, [ptr_p, #1*distance2/4] + str pol2, [ptr_p, #2*distance2/4] + str pol3, [ptr_p, #3*distance2/4] + str.w pol4, [ptr_p, #4*distance2/4] + str.w pol5, [ptr_p, #5*distance2/4] + str.w pol6, [ptr_p, #6*distance2/4] + str.w pol7, [ptr_p, #7*distance2/4] + str pol0, [ptr_p] + add.w ptr_p, ptr_p, #strincr2 vmov temp_l, s10 cmp.w ptr_p, temp_l @@ -342,10 +342,10 @@ pqcrystals_dilithium_invntt_tomont: str.w pol1, [ptr_p, #256] str.w pol2, [ptr_p, #512] str.w pol3, [ptr_p, #768] - str pol0, [ptr_p], #strincr3 // @slothy:core + str pol0, [ptr_p], #strincr3 // @slothy:core // @slothy:before=cmp vmov cntr, s9 - cmp.w ptr_p, cntr + cmp.w ptr_p, cntr // @slothy:id=cmp bne.w layer78_loop //restore registers diff --git a/examples/opt/armv7m/intt_dilithium_123_456_78_opt_m7.s b/examples/opt/armv7m/intt_dilithium_123_456_78_opt_m7.s index 3ae57ae4..ec1fa96c 100644 --- 
a/examples/opt/armv7m/intt_dilithium_123_456_78_opt_m7.s +++ b/examples/opt/armv7m/intt_dilithium_123_456_78_opt_m7.s @@ -200,288 +200,297 @@ pqcrystals_dilithium_invntt_tomont_opt_m7: add.w temp_l, ptr_p, #32*strincr // 32 iterations vmov s9, temp_l - // Instructions: 2 - // Expected cycles: 2 - // Expected IPC: 1.00 - // - // Cycle bound: 2.0 - // IPC bound: 1.00 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - vmov r1, s6 // *............................. - vmov r12, s8 // .*............................ + // Instructions: 2 + // Expected cycles: 1 + // Expected IPC: 2.00 + // + // Cycle bound: 1.0 + // IPC bound: 2.00 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr.w r5, [r0, #24] // *............................. + ldr.w r4, [r0, #28] // *............................. - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // vmov r1, s6 // *.............................. - // vmov r12, s8 // .*............................. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w r5, [r0, #24] // *.............................. + // ldr.w r4, [r0, #28] // *.............................. 
- push {temp_l} - vmov temp_l, s9 - sub temp_l, temp_l, #32 - vmov s9, temp_l - pop {temp_l} + push {r14} + vmov r14, s9 + sub r14, r14, #32 + vmov s9, r14 + pop {r14} layer123_loop: - // Instructions: 55 - // Expected cycles: 28 - // Expected IPC: 1.96 + // Instructions: 58 + // Expected cycles: 29 + // Expected IPC: 2.00 // - // Cycle bound: 28.0 - // IPC bound: 1.96 + // Cycle bound: 29.0 + // IPC bound: 2.00 // - // Wall time: 68.83s - // User time: 68.83s + // Wall time: 50.16s + // User time: 50.16s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr.w r5, [r0, #16] // *............................. - ldr.w r6, [r0, #20] // *............................. - ldr.w r4, [r0, #12] // .*............................ - ldr.w r9, [r0, #24] // .*............................ - ldr.w r7, [r0, #28] // ..*........................... - add r5, r6 // ..*........................... - add r9, r7 // ...*.......................... - sub.w r8, r5, r6, lsl #1 // ...*.......................... - add.w r6, r5, r9 // ....*......................... - smull r5, r11, r8, r1 // ....*......................... - sub.w r7, r9, r7, lsl #1 // .....*........................ - smull r14, r10, r8, r12 // .....*........................ - sub.w r8, r6, r9, lsl #1 // ......*....................... - smlal r5, r11, r7, r12 // ......*....................... - ldr.w r12, [r0, #8] // .......*...................... - smlal r14, r10, r7, r1 // .......*...................... - ldr.w r9, [r0] // ........*..................... - mul r7, r5, r2 // ........*..................... - add r12, r4 // .........*.................... - mul r1, r14, r2 // .........*.................... - sub.w r4, r12, r4, lsl #1 // ..........*................... - smlal r5, r11, r7, r3 // ..........*................... - vmov r7, s4 // ...........*.................. - smlal r14, r10, r1, r3 // ...........*.................. - ldr.w r14, [r0, #4] // ............*................. 
- smull r1, r4, r4, r7 // ............*................. - add r9, r14 // .............*................ - smull r5, r8, r8, r7 // .............*................ - mul r7, r1, r2 // ..............*............... - sub.w r14, r9, r14, lsl #1 // ...............*.............. - add r9, r12 // ...............*.............. - sub.w r12, r9, r12, lsl #1 // ................*............. - smlal r1, r4, r7, r3 // ................*............. - add r9, r6 // .................*............ - mul r1, r5, r2 // .................*............ - add r14, r4 // ..................*........... - sub.w r7, r9, r6, lsl #1 // ..................*........... - sub.w r4, r14, r4, lsl #1 // ...................*.......... - smlal r5, r8, r1, r3 // ...................*.......... - add r4, r10 // ....................*......... - str.w r4, [r0, #12] // ....................*......... - add r14, r11 // .....................*........ - str.w r7, [r0, #16] // .....................*........ - add r12, r8 // ......................*....... - str.w r12, [r0, #8] // ......................*....... - sub.w r11, r14, r11, lsl #1 // .......................*...... - str.w r11, [r0, #20] // .......................*...... - sub.w r8, r12, r8, lsl #1 // ........................*..... - str.w r8, [r0, #24] // ........................*..... - sub.w r6, r4, r10, lsl #1 // .........................*.... - str.w r6, [r0, #28] // .........................*.... - vmov r1, s6 // ..........................e... - str.w r14, [r0, #4] // ..........................*... - vmov r12, s8 // ...........................e.. - str.w r9, [r0], #32 // ...........................*.. + vmov r6, s4 // *............................. + ldr.w r10, [r0, #16] // *............................. + add r5, r4 // .*............................ + ldr.w r14, [r0, #20] // .*............................ + add r10, r14 // ..*........................... + ldr.w r11, [r0, #12] // ..*........................... 
+ sub.w r1, r5, r4, lsl #1 // ...*.......................... + add.w r9, r10, r5 // ...*.......................... + sub.w r12, r9, r5, lsl #1 // ....*......................... + ldr.w r7, [r0, #8] // ....*......................... + sub.w r10, r10, r14, lsl #1 // .....*........................ + ldr.w r14, [r0] // .....*........................ + ldr.w r4, [r0, #4] // ......*....................... + smull r8, r12, r12, r6 // ......*....................... + add r14, r4 // .......*...................... + add r7, r11 // .......*...................... + sub.w r5, r7, r11, lsl #1 // ........*..................... + mul r11, r8, r2 // ........*..................... + sub.w r4, r14, r4, lsl #1 // .........*.................... + smull r5, r6, r5, r6 // .........*.................... + add r14, r7 // ..........*................... + smlal r8, r12, r11, r3 // ..........*................... + sub.w r7, r14, r7, lsl #1 // ...........*.................. + mul r11, r5, r2 // ...........*.................. + add r14, r9 // ............*................. + str.w r14, [r0], #32 // ............*................. // @slothy:before=cmp + sub.w r14, r14, r9, lsl #1 // .............*................ + str r14, [r0, #-16] // .............*................ + vmov r8, s6 // ..............*............... + smlal r5, r6, r11, r3 // ..............*............... + vmov r5, s8 // ...............*.............. + smull r9, r11, r10, r8 // ...............*.............. + vmov r14, s9 // ................*............. + smlal r9, r11, r1, r5 // ................*............. + cmp.w r0, r14 // .................*............ // @slothy:id=cmp + smull r14, r10, r10, r5 // .................*............ + add r7, r12 // ..................*........... + mul r5, r9, r2 // ..................*........... + add r4, r6 // ...................*.......... + smlal r14, r10, r1, r8 // ...................*.......... + sub.w r1, r4, r6, lsl #1 // ....................*......... 
+ smlal r9, r11, r5, r3 // ....................*......... + ldr.w r5, [r0, #24] // .....................e........ + mul r6, r14, r2 // .....................*........ + add r4, r11 // ......................*....... + str r4, [r0, #-28] // ......................*....... + sub.w r9, r4, r11, lsl #1 // .......................*...... + smlal r14, r10, r6, r3 // .......................*...... + sub.w r12, r7, r12, lsl #1 // ........................*..... + str r9, [r0, #-12] // ........................*..... + add r1, r10 // .........................*.... + str r1, [r0, #-20] // .........................*.... + ldr.w r4, [r0, #28] // ..........................e... + str r12, [r0, #-8] // ..........................*... + str r7, [r0, #-24] // ...........................*.. + sub.w r7, r1, r10, lsl #1 // ...........................*.. + str r7, [r0, #-4] // ............................*. + bne.w layer123_loop // ............................*. // @slothy:branch - // ------ cycle (expected) ------> + // --------- cycle (expected) ---------> // 0 25 - // |------------------------|----- - // ldr.w R4, [R0, #4*16/4] // ..*............................ - // ldr.w R6, [R0, #5*16/4] // ..*............................ - // ldr.w R12, [R0, #6*16/4] // ..'*........................... - // ldr.w R8, [R0, #7*16/4] // ..'.*.......................... - // add R4, R6 // ..'.*.......................... - // add R12, R8 // ..'..*......................... - // sub.w R6, R4, R6, lsl #1 // ..'..*......................... - // sub.w R8, R12, R8, lsl #1 // ..'....*....................... - // add.w R4, R4, R12 // ..'...*........................ - // sub.w R12, R4, R12, lsl #1 // ..'.....*...................... - // vmov R9, s6 // e.'.........................~.. - // vmov R10, s8 // .e'..........................~. - // smull R5, R11, R6, R9 // ..'...*........................ - // smlal R5, R11, R8, R10 // ..'.....*...................... - // mul R1, R5, R2 // ..'.......*.................... 
- // smlal R5, R11, R1, R3 // ..'.........*.................. - // smull R7, R14, R6, R10 // ..'....*....................... - // smlal R7, R14, R8, R9 // ..'......*..................... - // mul R1, R7, R2 // ..'........*................... - // smlal R7, R14, R1, R3 // ..'..........*................. - // ldr.w R5, [R0] // ..'.......*.................... - // ldr.w R6, [R0, #1*16/4] // ..'...........*................ - // ldr.w R7, [R0, #2*16/4] // ..'......*..................... - // ldr.w R8, [R0, #3*16/4] // ..'*........................... - // add R5, R6 // ..'............*............... - // add R7, R8 // ..'........*................... - // sub.w R6, R5, R6, lsl #1 // ..'..............*............. - // sub.w R8, R7, R8, lsl #1 // ..'.........*.................. - // vmov R1, s4 // ..'..........*................. - // smull R9, R8, R8, R1 // ..'...........*................ - // mul R10, R9, R2 // ..'.............*.............. - // smlal R9, R8, R10, R3 // ..'...............*............ - // add R5, R7 // ..'..............*............. - // add R6, R8 // ..'.................*.......... - // sub.w R7, R5, R7, lsl #1 // ..'...............*............ - // sub.w R8, R6, R8, lsl #1 // ..'..................*......... - // smull R9, R12, R12, R1 // ..'............*............... - // mul R10, R9, R2 // ..'................*........... - // smlal R9, R12, R10, R3 // ..'..................*......... - // add R5, R4 // ..'................*........... - // add R6, R11 // ..'....................*....... - // add R7, R12 // ..'.....................*...... - // add R8, R14 // ..'...................*........ - // sub.w R4, R5, R4, lsl #1 // ..'.................*.......... - // sub.w R11, R6, R11, lsl #1 // ..'......................*..... - // sub.w R12, R7, R12, lsl #1 // ..'.......................*.... - // sub.w R14, R8, R14, lsl #1 // ..'........................*... - // str.w R6, [R0, #1*16/4] // ~.'.........................*.. 
- // str.w R7, [R0, #2*16/4] // ..'.....................*...... - // str.w R8, [R0, #3*16/4] // ..'...................*........ - // str.w R4, [R0, #4*16/4] // ..'....................*....... - // str.w R11, [R0, #5*16/4] // ..'......................*..... - // str.w R12, [R0, #6*16/4] // ..'.......................*.... - // str.w R14, [R0, #7*16/4] // ..'........................*... - // str.w R5, [R0], #32 // .~'..........................*. + // |------------------------|----------- + // ldr.w R4, [R0, #4*16/4] // ........*............................ + // ldr.w R6, [R0, #5*16/4] // ........'*........................... + // ldr.w R12, [R0, #6*16/4] // e.......'....................~....... + // ldr.w R8, [R0, #7*16/4] // .....e..'.........................~.. + // add R4, R6 // ........'.*.......................... + // add R12, R8 // ........'*........................... + // sub.w R6, R4, R6, lsl #1 // ........'....*....................... + // sub.w R8, R12, R8, lsl #1 // ........'..*......................... + // add.w R4, R4, R12 // ........'..*......................... + // sub.w R12, R4, R12, lsl #1 // ........'...*........................ + // vmov R9, s6 // ........'.............*.............. + // vmov R10, s8 // ........'..............*............. + // smull R5, R11, R6, R9 // ........'..............*............. + // smlal R5, R11, R8, R10 // ........'...............*............ + // mul R1, R5, R2 // ........'.................*.......... + // smlal R5, R11, R1, R3 // ........'...................*........ + // smull R7, R14, R6, R10 // ........'................*........... + // smlal R7, R14, R8, R9 // ........'..................*......... + // mul R1, R7, R2 // ~.......'....................*....... + // smlal R7, R14, R1, R3 // ..~.....'......................*..... + // ldr.w R5, [R0] // ........'....*....................... + // ldr.w R6, [R0, #1*16/4] // ........'.....*...................... 
+ // ldr.w R7, [R0, #2*16/4] // ........'...*........................ + // ldr.w R8, [R0, #3*16/4] // ........'.*.......................... + // add R5, R6 // ........'......*..................... + // add R7, R8 // ........'......*..................... + // sub.w R6, R5, R6, lsl #1 // ........'........*................... + // sub.w R8, R7, R8, lsl #1 // ........'.......*.................... + // vmov R1, s4 // ........*............................ + // smull R9, R8, R8, R1 // ........'........*................... + // mul R10, R9, R2 // ........'..........*................. + // smlal R9, R8, R10, R3 // ........'.............*.............. + // add R5, R7 // ........'.........*.................. + // add R6, R8 // ........'..................*......... + // sub.w R7, R5, R7, lsl #1 // ........'..........*................. + // sub.w R8, R6, R8, lsl #1 // ........'...................*........ + // smull R9, R12, R12, R1 // ........'.....*...................... + // mul R10, R9, R2 // ........'.......*.................... + // smlal R9, R12, R10, R3 // ........'.........*.................. + // add R5, R4 // ........'...........*................ + // add R6, R11 // .~......'.....................*...... + // add R7, R12 // ........'.................*.......... + // add R8, R14 // ....~...'........................*... + // sub.w R4, R5, R4, lsl #1 // ........'............*............... + // sub.w R11, R6, R11, lsl #1 // ..~.....'......................*..... + // sub.w R12, R7, R12, lsl #1 // ...~....'.......................*.... + // sub.w R14, R8, R14, lsl #1 // ......~.'..........................*. + // str.w R6, [R0, #1*16/4] // .~......'.....................*...... + // str.w R7, [R0, #2*16/4] // ......~.'..........................*. + // str.w R8, [R0, #3*16/4] // ....~...'........................*... + // str.w R4, [R0, #4*16/4] // ........'............*............... + // str.w R11, [R0, #5*16/4] // ...~....'.......................*.... 
+ // str.w R12, [R0, #6*16/4] // .....~..'.........................*.. + // str.w R14, [R0, #7*16/4] // .......~'...........................* + // str.w R5, [R0], #32 // ........'...........*................ + // vmov R10, s9 // ........'...............*............ + // cmp.w R0, R10 // ........'................*........... + // bne.w layer123_loop // .......~'...........................* - vmov temp_l, s9 - cmp ptr_p, temp_l - bne layer123_loop - // Instructions: 53 - // Expected cycles: 28 - // Expected IPC: 1.89 - // - // Cycle bound: 27.0 - // IPC bound: 1.96 - // - // Wall time: 289.59s - // User time: 289.59s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr.w r5, [r0, #16] // *............................. - ldr.w r11, [r0, #20] // *............................. - ldr.w r7, [r0, #12] // .*............................ - ldr.w r10, [r0, #24] // .*............................ - ldr.w r14, [r0, #28] // ..*........................... - add r5, r11 // ..*........................... - add r10, r14 // ...*.......................... - sub.w r11, r5, r11, lsl #1 // ...*.......................... - add.w r8, r5, r10 // ....*......................... - smull r9, r5, r11, r1 // ....*......................... - sub.w r4, r10, r14, lsl #1 // .....*........................ - smull r14, r11, r11, r12 // .....*........................ - ldr.w r6, [r0, #8] // ......*....................... - smlal r9, r5, r4, r12 // ......*....................... - sub.w r10, r8, r10, lsl #1 // .......*...................... - smlal r14, r11, r4, r1 // .......*...................... - add r6, r7 // ........*..................... - mul r12, r9, r2 // ........*..................... - sub.w r1, r6, r7, lsl #1 // .........*.................... - mul r7, r14, r2 // .........*.................... - ldr.w r4, [r0, #4] // ..........*................... - smlal r9, r5, r12, r3 // ..........*................... - vmov r9, s4 // ...........*.................. 
- smlal r14, r11, r7, r3 // ...........*.................. - ldr.w r12, [r0] // ............*................. - smull r1, r14, r1, r9 // ............*................. - smull r10, r7, r10, r9 // .............*................ - add r12, r4 // ..............*............... - mul r9, r1, r2 // ..............*............... - sub.w r4, r12, r4, lsl #1 // ...............*.............. - add r12, r6 // ...............*.............. - sub.w r6, r12, r6, lsl #1 // ................*............. - add r12, r8 // ................*............. - sub.w r8, r12, r8, lsl #1 // .................*............ - smlal r1, r14, r9, r3 // .................*............ - mul r1, r10, r2 // ..................*........... - str.w r8, [r0, #16] // ...................*.......... - add r4, r14 // ...................*.......... - sub.w r8, r4, r14, lsl #1 // ....................*......... - smlal r10, r7, r1, r3 // ....................*......... - add r4, r5 // .....................*........ - str.w r4, [r0, #4] // .....................*........ - add r6, r7 // ......................*....... - str.w r6, [r0, #8] // ......................*....... - sub.w r9, r4, r5, lsl #1 // .......................*...... - str.w r9, [r0, #20] // .......................*...... - sub.w r9, r6, r7, lsl #1 // ........................*..... - str.w r9, [r0, #24] // ........................*..... - add r8, r11 // .........................*.... - str.w r8, [r0, #12] // .........................*.... - sub.w r9, r8, r11, lsl #1 // ..........................*... - str.w r9, [r0, #28] // ..........................*... - str.w r12, [r0], #32 // ...........................*.. + + // Instructions: 56 + // Expected cycles: 28 + // Expected IPC: 2.00 + // + // Cycle bound: 28.0 + // IPC bound: 2.00 + // + // Wall time: 9.01s + // User time: 9.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr.w r11, [r0, #20] // *............................. 
+ ldr.w r14, [r0, #16] // *............................. + vmov r9, s6 // .*............................ + ldr.w r1, [r0, #12] // .*............................ + vmov r6, s8 // ..*........................... + add r14, r11 // ..*........................... + sub.w r12, r14, r11, lsl #1 // ...*.......................... + add r5, r4 // ...*.......................... + sub.w r4, r5, r4, lsl #1 // ....*......................... + smull r10, r7, r12, r6 // ....*......................... + ldr.w r11, [r0, #8] // .....*........................ + smlal r10, r7, r4, r9 // .....*........................ + add.w r8, r14, r5 // ......*....................... + smull r9, r14, r12, r9 // ......*....................... + add r11, r1 // .......*...................... + mul r12, r10, r2 // .......*...................... + sub.w r1, r11, r1, lsl #1 // ........*..................... + smlal r9, r14, r4, r6 // ........*..................... + ldr.w r4, [r0] // .........*.................... + smlal r10, r7, r12, r3 // .........*.................... + ldr.w r12, [r0, #4] // ..........*................... + mul r10, r9, r2 // ..........*................... + add r4, r12 // ...........*.................. + vmov r6, s4 // ...........*.................. + sub.w r5, r8, r5, lsl #1 // ............*................. + smlal r9, r14, r10, r3 // ............*................. + sub.w r9, r4, r12, lsl #1 // .............*................ + smull r5, r10, r5, r6 // .............*................ + add r4, r11 // ..............*............... + smull r1, r12, r1, r6 // ..............*............... + sub.w r6, r4, r11, lsl #1 // ...............*.............. + mul r11, r5, r2 // ...............*.............. + add r4, r8 // ................*............. + str.w r4, [r0], #32 // ................*............. // @slothy:before=cmp + sub.w r8, r4, r8, lsl #1 // .................*............ + smlal r5, r10, r11, r3 // .................*............ 
+ vmov r4, s9 // ..................*........... + mul r11, r1, r2 // ..................*........... + add r6, r10 // ...................*.......... + str r6, [r0, #-24] // ...................*.......... + sub.w r6, r6, r10, lsl #1 // ....................*......... + smlal r1, r12, r11, r3 // ....................*......... + str r8, [r0, #-16] // .....................*........ + cmp.w r0, r4 // .....................*........ // @slothy:id=cmp + add r9, r12 // ......................*....... + str r6, [r0, #-8] // ......................*....... + sub.w r6, r9, r12, lsl #1 // .......................*...... + add r9, r14 // .......................*...... + str r9, [r0, #-28] // ........................*..... + add r6, r7 // ........................*..... + str r6, [r0, #-20] // .........................*.... + sub.w r12, r9, r14, lsl #1 // .........................*.... + str r12, [r0, #-12] // ..........................*... + sub.w r8, r6, r7, lsl #1 // ..........................*... + str r8, [r0, #-4] // ...........................*.. // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr.w r5, [r0, #16] // *.............................. - // ldr.w r6, [r0, #20] // *.............................. - // ldr.w r4, [r0, #12] // .*............................. - // ldr.w r9, [r0, #24] // .*............................. - // ldr.w r7, [r0, #28] // ..*............................ - // add r5, r6 // ..*............................ - // add r9, r7 // ...*........................... - // sub.w r8, r5, r6, lsl #1 // ...*........................... - // add.w r6, r5, r9 // ....*.......................... - // smull r5, r11, r8, r1 // ....*.......................... - // sub.w r7, r9, r7, lsl #1 // .....*......................... - // smull r14, r10, r8, r12 // .....*......................... - // sub.w r8, r6, r9, lsl #1 // .......*....................... - // smlal r5, r11, r7, r12 // ......*........................ 
- // ldr.w r12, [r0, #8] // ......*........................ - // smlal r14, r10, r7, r1 // .......*....................... - // ldr.w r9, [r0] // ............*.................. - // mul r7, r5, r2 // ........*...................... - // add r12, r4 // ........*...................... - // mul r1, r14, r2 // .........*..................... - // sub.w r4, r12, r4, lsl #1 // .........*..................... - // smlal r5, r11, r7, r3 // ..........*.................... - // vmov r7, s4 // ...........*................... - // smlal r14, r10, r1, r3 // ...........*................... - // ldr.w r14, [r0, #4] // ..........*.................... - // smull r1, r4, r4, r7 // ............*.................. - // add r9, r14 // ..............*................ - // smull r5, r8, r8, r7 // .............*................. - // mul r7, r1, r2 // ..............*................ - // sub.w r14, r9, r14, lsl #1 // ...............*............... - // add r9, r12 // ...............*............... - // sub.w r12, r9, r12, lsl #1 // ................*.............. - // smlal r1, r4, r7, r3 // .................*............. - // add r9, r6 // ................*.............. - // mul r1, r5, r2 // ..................*............ - // add r14, r4 // ...................*........... - // sub.w r7, r9, r6, lsl #1 // .................*............. - // sub.w r4, r14, r4, lsl #1 // ....................*.......... - // smlal r5, r8, r1, r3 // ....................*.......... - // add r4, r10 // .........................*..... - // str.w r4, [r0, #12] // .........................*..... - // add r14, r11 // .....................*......... - // str.w r7, [r0, #16] // ...................*........... - // add r12, r8 // ......................*........ - // str.w r12, [r0, #8] // ......................*........ - // sub.w r11, r14, r11, lsl #1 // .......................*....... - // str.w r11, [r0, #20] // .......................*....... 
- // sub.w r8, r12, r8, lsl #1 // ........................*...... - // str.w r8, [r0, #24] // ........................*...... - // sub.w r6, r4, r10, lsl #1 // ..........................*.... - // str.w r6, [r0, #28] // ..........................*.... - // str.w r14, [r0, #4] // .....................*......... - // str.w r9, [r0], #32 // ...........................*... + // vmov r6, s4 // ...........*................... + // ldr.w r10, [r0, #16] // *.............................. + // add r5, r4 // ...*........................... + // ldr.w r14, [r0, #20] // *.............................. + // add r10, r14 // ..*............................ + // ldr.w r11, [r0, #12] // .*............................. + // sub.w r1, r5, r4, lsl #1 // ....*.......................... + // add.w r9, r10, r5 // ......*........................ + // sub.w r12, r9, r5, lsl #1 // ............*.................. + // ldr.w r7, [r0, #8] // .....*......................... + // sub.w r10, r10, r14, lsl #1 // ...*........................... + // ldr.w r14, [r0] // .........*..................... + // ldr.w r4, [r0, #4] // ..........*.................... + // smull r8, r12, r12, r6 // .............*................. + // add r14, r4 // ...........*................... + // add r7, r11 // .......*....................... + // sub.w r5, r7, r11, lsl #1 // ........*...................... + // mul r11, r8, r2 // ...............*............... + // sub.w r4, r14, r4, lsl #1 // .............*................. + // smull r5, r6, r5, r6 // ..............*................ + // add r14, r7 // ..............*................ + // smlal r8, r12, r11, r3 // .................*............. + // sub.w r7, r14, r7, lsl #1 // ...............*............... + // mul r11, r5, r2 // ..................*............ + // add r14, r9 // ................*.............. + // str.w r14, [r0], #32 // ................*.............. + // sub.w r14, r14, r9, lsl #1 // .................*............. 
+ // str r14, [r0, #-16] // .....................*......... + // vmov r8, s6 // .*............................. + // smlal r5, r6, r11, r3 // ....................*.......... + // vmov r5, s8 // ..*............................ + // smull r9, r11, r10, r8 // ......*........................ + // vmov r14, s9 // ..................*............ + // smlal r9, r11, r1, r5 // ........*...................... + // cmp.w r0, r14 // .....................*......... + // smull r14, r10, r10, r5 // ....*.......................... + // add r7, r12 // ...................*........... + // mul r5, r9, r2 // ..........*.................... + // add r4, r6 // ......................*........ + // smlal r14, r10, r1, r8 // .....*......................... + // sub.w r1, r4, r6, lsl #1 // .......................*....... + // smlal r9, r11, r5, r3 // ............*.................. + // mul r6, r14, r2 // .......*....................... + // add r4, r11 // .......................*....... + // str r4, [r0, #-28] // ........................*...... + // sub.w r9, r4, r11, lsl #1 // .........................*..... + // smlal r14, r10, r6, r3 // .........*..................... + // sub.w r12, r7, r12, lsl #1 // ....................*.......... + // str r9, [r0, #-12] // ..........................*.... + // add r1, r10 // ........................*...... + // str r1, [r0, #-20] // .........................*..... + // str r12, [r0, #-8] // ......................*........ + // str r7, [r0, #-24] // ...................*........... + // sub.w r7, r1, r10, lsl #1 // ..........................*.... + // str r7, [r0, #-4] // ...........................*... + // bne.w layer123_loop // ...........................*... 
sub ptr_p, #32*strincr @@ -507,133 +516,135 @@ layer123_loop: // User time: 0.01s // layer456_first_loop: - // Instructions: 55 + // Instructions: 56 // Expected cycles: 29 - // Expected IPC: 1.90 + // Expected IPC: 1.93 // // Cycle bound: 28.0 - // IPC bound: 1.96 + // IPC bound: 2.00 // - // Wall time: 4.92s - // User time: 4.92s + // Wall time: 8.50s + // User time: 8.50s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - vmov r12, s8 // *............................. - ldr.w r9, [r0, #128] // *............................. - vmov r4, s6 // .*............................ - ldr.w r14, [r0, #160] // .*............................ - add r9, r14 // ..*........................... - ldr.w r7, [r0, #192] // ..*........................... - ldr.w r6, [r0, #224] // ...*.......................... - add r7, r6 // ....*......................... - sub.w r10, r9, r14, lsl #1 // ....*......................... - add.w r1, r9, r7 // .....*........................ - smull r5, r11, r10, r4 // .....*........................ - sub.w r6, r7, r6, lsl #1 // ......*....................... - smull r14, r10, r10, r12 // ......*....................... - ldr.w r9, [r0], #128 // .......*...................... - smlal r14, r10, r6, r4 // .......*...................... - sub.w r7, r1, r7, lsl #1 // ........*..................... - smlal r5, r11, r6, r12 // ........*..................... - ldr r12, [r0, #-64] // .........*.................... - mul r8, r14, r2 // .........*.................... - ldr r4, [r0, #-32] // ..........*................... - mul r6, r5, r2 // ..........*................... - add r12, r4 // ...........*.................. - smlal r14, r10, r8, r3 // ...........*.................. - vmov r8, s4 // ............*................. - smlal r5, r11, r6, r3 // ............*................. - ldr r5, [r0, #-96] // .............*................ - smull r7, r6, r7, r8 // .............*................ - add r9, r5 // ..............*............... 
- sub.w r14, r12, r4, lsl #1 // ...............*.............. - mul r4, r7, r2 // ...............*.............. - sub.w r5, r9, r5, lsl #1 // ................*............. - smull r14, r8, r14, r8 // ................*............. - add r9, r12 // .................*............ - smlal r7, r6, r4, r3 // .................*............ - sub.w r4, r9, r12, lsl #1 // ..................*........... - mul r12, r14, r2 // ..................*........... - add r4, r6 // ...................*.......... - str r4, [r0, #-64] // ...................*.......... - sub.w r4, r4, r6, lsl #1 // ....................*......... - smlal r14, r8, r12, r3 // ....................*......... - str.w r4, [r0, #64] // .....................*........ - add r9, r1 // .....................*........ - add r5, r8 // ......................*....... - str r9, [r0, #-128] // ......................*....... - sub.w r14, r5, r8, lsl #1 // .......................*...... - add r5, r11 // .......................*...... - sub.w r7, r5, r11, lsl #1 // ........................*..... - str.w r7, [r0, #32] // ........................*..... - str r5, [r0, #-96] // .........................*.... - add r14, r10 // .........................*.... - str r14, [r0, #-32] // ..........................*... - sub.w r14, r14, r10, lsl #1 // ..........................*... - str.w r14, [r0, #96] // ...........................*.. - sub.w r5, r9, r1, lsl #1 // ...........................*.. - str.w r5, [r0], #128 // ............................*. + ldr.w r7, [r0, #128] // *............................. + vmov r5, s8 // *............................. + ldr.w r9, [r0, #160] // .*............................ + vmov r1, s4 // .*............................ + add r7, r9 // ..*........................... + ldr.w r6, [r0, #192] // ..*........................... + ldr.w r11, [r0, #224] // ...*.......................... + vmov r12, s6 // ...*.......................... + add r6, r11 // ....*......................... 
+ sub.w r8, r7, r9, lsl #1 // ....*......................... + add.w r7, r7, r6 // .....*........................ + smull r14, r4, r8, r5 // .....*........................ + sub.w r10, r6, r11, lsl #1 // ......*....................... + smull r9, r11, r8, r12 // ......*....................... + ldr r8, [r0, #96] // .......*...................... + smlal r14, r4, r10, r12 // .......*...................... + ldr r12, [r0, #64] // ........*..................... + smlal r9, r11, r10, r5 // ........*..................... + sub.w r5, r7, r6, lsl #1 // .........*.................... + mul r10, r14, r2 // .........*.................... + add r12, r8 // ..........*................... + mul r6, r9, r2 // ..........*................... + sub.w r8, r12, r8, lsl #1 // ...........*.................. + smlal r14, r4, r10, r3 // ...........*.................. + ldr.w r14, [r0] // ............*................. + smlal r9, r11, r6, r3 // ............*................. + ldr r9, [r0, #32] // .............*................ + smull r6, r5, r5, r1 // .............*................ + smull r8, r10, r8, r1 // ..............*............... + add r14, r9 // ...............*.............. + mul r1, r6, r2 // ...............*.............. + sub.w r9, r14, r9, lsl #1 // ................*............. + add r14, r12 // .................*............ + smlal r6, r5, r1, r3 // .................*............ + sub.w r12, r14, r12, lsl #1 // ..................*........... + mul r6, r8, r2 // ..................*........... + add r12, r5 // ...................*.......... + str r12, [r0, #64] // ...................*.......... + sub.w r5, r12, r5, lsl #1 // ....................*......... + str.w r5, [r0, #192] // ....................*......... + add r14, r7 // .....................*........ + smlal r8, r10, r6, r3 // .....................*........ + str r14, [r0] // ......................*....... + sub.w r8, r14, r7, lsl #1 // ......................*....... 
+ add r9, r10 // .......................*...... + str.w r8, [r0, #128] // .......................*...... + sub.w r10, r9, r10, lsl #1 // ........................*..... + add r9, r11 // ........................*..... + sub.w r6, r9, r11, lsl #1 // .........................*.... + str.w r6, [r0, #160] // .........................*.... + str r9, [r0, #32] // ..........................*... + add r10, r4 // ..........................*... + str r10, [r0, #96] // ...........................*.. + sub.w r6, r10, r4, lsl #1 // ...........................*.. + str.w r6, [r0, #224] // ............................*. + add.w r0, r0, #256 // ............................*. - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr.w R4, [R0, #4*128/4] // *............................~. - // ldr.w R6, [R0, #5*128/4] // .*...........................'. - // ldr.w R12, [R0, #6*128/4] // ..*..........................'. - // ldr.w R8, [R0, #7*128/4] // ...*.........................'. - // add R4, R6 // ..*..........................'. - // add R12, R8 // ....*........................'. - // sub.w R6, R4, R6, lsl #1 // ....*........................'. - // sub.w R8, R12, R8, lsl #1 // ......*......................'. - // add.w R4, R4, R12 // .....*.......................'. - // sub.w R12, R4, R12, lsl #1 // ........*....................'. - // vmov R9, s6 // .*...........................'. - // vmov R10, s8 // *............................~. - // smull R5, R11, R6, R9 // .....*.......................'. - // smlal R5, R11, R8, R10 // ........*....................'. - // mul R1, R5, R2 // ..........*..................'. - // smlal R5, R11, R1, R3 // ............*................'. - // smull R7, R14, R6, R10 // ......*......................'. - // smlal R7, R14, R8, R9 // .......*.....................'. - // mul R1, R7, R2 // .........*...................'. - // smlal R7, R14, R1, R3 // ...........*.................'. 
- // ldr.w R5, [R0], #128 // .......*.....................'. - // ldr R6, [R0, #1*128/4-128] // .............*...............'. - // ldr R7, [R0, #2*128/4-128] // .........*...................'. - // ldr R8, [R0, #3*128/4-128] // ..........*..................'. - // add R5, R6 // ..............*..............'. - // add R7, R8 // ...........*.................'. - // sub.w R6, R5, R6, lsl #1 // ................*............'. - // sub.w R8, R7, R8, lsl #1 // ...............*.............'. - // vmov R1, s4 // ............*................'. - // smull R9, R8, R8, R1 // ................*............'. - // mul R10, R9, R2 // ..................*..........'. - // smlal R9, R8, R10, R3 // ....................*........'. - // add R5, R7 // .................*...........'. - // add R6, R8 // ......................*......'. - // sub.w R7, R5, R7, lsl #1 // ..................*..........'. - // sub.w R8, R6, R8, lsl #1 // .......................*.....'. - // smull R9, R12, R12, R1 // .............*...............'. - // mul R10, R9, R2 // ...............*.............'. - // smlal R9, R12, R10, R3 // .................*...........'. - // add R5, R4 // .....................*.......'. - // add R6, R11 // .......................*.....'. - // add R7, R12 // ...................*.........'. - // add R8, R14 // .........................*...'. - // sub.w R4, R5, R4, lsl #1 // ...........................*.'. - // sub.w R11, R6, R11, lsl #1 // ........................*....'. - // sub.w R12, R7, R12, lsl #1 // ....................*........'. - // sub.w R14, R8, R14, lsl #1 // ..........................*..'. - // str R6, [R0, #1*128/4-128] // .........................*...'. - // str R7, [R0, #2*128/4-128] // ...................*.........'. - // str R8, [R0, #3*128/4-128] // ..........................*..'. - // str.w R11, [R0, #5*128/4-128] // ........................*....'. - // str.w R12, [R0, #6*128/4-128] // .....................*.......'. 
- // str.w R14, [R0, #7*128/4-128] // ...........................*.'. - // str R5, [R0, #-128] // ......................*......'. - // str.w R4, [R0], #128 // ............................*'. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w R4, [R0, #4*128/4] // *............................~. + // ldr.w R6, [R0, #5*128/4] // .*...........................'. + // ldr.w R12, [R0, #6*128/4] // ..*..........................'. + // ldr.w R8, [R0, #7*128/4] // ...*.........................'. + // add R4, R6 // ..*..........................'. + // add R12, R8 // ....*........................'. + // sub.w R6, R4, R6, lsl #1 // ....*........................'. + // sub.w R8, R12, R8, lsl #1 // ......*......................'. + // add.w R4, R4, R12 // .....*.......................'. + // sub.w R12, R4, R12, lsl #1 // .........*...................'. + // vmov R9, s6 // ...*.........................'. + // vmov R10, s8 // *............................~. + // smull R5, R11, R6, R9 // ......*......................'. + // smlal R5, R11, R8, R10 // ........*....................'. + // mul R1, R5, R2 // ..........*..................'. + // smlal R5, R11, R1, R3 // ............*................'. + // smull R7, R14, R6, R10 // .....*.......................'. + // smlal R7, R14, R8, R9 // .......*.....................'. + // mul R1, R7, R2 // .........*...................'. + // smlal R7, R14, R1, R3 // ...........*.................'. + // ldr.w R5, [R0] // ............*................'. + // ldr R6, [R0, #1*128/4] // .............*...............'. + // ldr R7, [R0, #2*128/4] // ........*....................'. + // ldr R8, [R0, #3*128/4] // .......*.....................'. + // add R5, R6 // ...............*.............'. + // add R7, R8 // ..........*..................'. + // sub.w R6, R5, R6, lsl #1 // ................*............'. + // sub.w R8, R7, R8, lsl #1 // ...........*.................'. 
+ // vmov R1, s4 // .*...........................'. + // smull R9, R8, R8, R1 // ..............*..............'. + // mul R10, R9, R2 // ..................*..........'. + // smlal R9, R8, R10, R3 // .....................*.......'. + // add R5, R7 // .................*...........'. + // add R6, R8 // .......................*.....'. + // sub.w R7, R5, R7, lsl #1 // ..................*..........'. + // sub.w R8, R6, R8, lsl #1 // ........................*....'. + // smull R9, R12, R12, R1 // .............*...............'. + // mul R10, R9, R2 // ...............*.............'. + // smlal R9, R12, R10, R3 // .................*...........'. + // add R5, R4 // .....................*.......'. + // add R6, R11 // ........................*....'. + // add R7, R12 // ...................*.........'. + // add R8, R14 // ..........................*..'. + // sub.w R4, R5, R4, lsl #1 // ......................*......'. + // sub.w R11, R6, R11, lsl #1 // .........................*...'. + // sub.w R12, R7, R12, lsl #1 // ....................*........'. + // sub.w R14, R8, R14, lsl #1 // ...........................*.'. + // str R6, [R0, #1*128/4] // ..........................*..'. + // str R7, [R0, #2*128/4] // ...................*.........'. + // str R8, [R0, #3*128/4] // ...........................*.'. + // str.w R4, [R0, #4*128/4] // .......................*.....'. + // str.w R11, [R0, #5*128/4] // .........................*...'. + // str.w R12, [R0, #6*128/4] // ....................*........'. + // str.w R14, [R0, #7*128/4] // ............................*'. + // str R5, [R0] // ......................*......'. + // add.w R0, R0, #256 // ............................*'. 
vmov temp_l, s10 cmp ptr_p, temp_l @@ -662,193 +673,195 @@ layer456_first_loop: // Expected cycles: 0 // Expected IPC: 0.00 // - // Wall time: 0.01s - // User time: 0.01s + // Wall time: 0.02s + // User time: 0.02s // layer456_loop: - // Instructions: 83 - // Expected cycles: 49 - // Expected IPC: 1.69 - // - // Cycle bound: 42.0 - // IPC bound: 1.98 - // - // Wall time: 6.79s - // User time: 6.79s - // - // --------------- cycle (expected) ---------------> - // 0 25 - // |------------------------|----------------------- - vmov r5, s2 // *................................................ - ldr.w r9, [r0], #128 // *................................................ - ldr r1, [r0, #-32] // ..*.............................................. - ldr.w r14, [r0, #96] // ...*............................................. - ldr.w r10, [r0, #64] // ....*............................................ - smull r7, r4, r1, r5 // ....*............................................ - ldr r11, [r0, #-96] // .....*........................................... - smull r6, r1, r14, r5 // .....*........................................... - mul r12, r7, r2 // ......*.......................................... - ldr r14, [r0, #-64] // .......*......................................... - mul r8, r6, r2 // .......*......................................... - smlal r7, r4, r12, r3 // ........*........................................ - ldr.w r7, [r0, #32] // .........*....................................... - smull r12, r11, r11, r5 // .........*....................................... - add r14, r4 // ..........*...................................... - smlal r6, r1, r8, r3 // ..........*...................................... - sub.w r6, r14, r4, lsl #1 // ...........*..................................... - mul r4, r12, r2 // ...........*..................................... - add r10, r1 // ............*.................................... 
- smull r5, r8, r7, r5 // ............*.................................... - sub.w r1, r10, r1, lsl #1 // .............*................................... - smlal r12, r11, r4, r3 // .............*................................... - vmov r4, s4 // ..............*.................................. - mul r12, r5, r2 // ..............*.................................. - add r9, r11 // ...............*................................. - smull r7, r1, r1, r4 // ...............*................................. - smlal r5, r8, r12, r3 // ................*................................ - ldr.w r5, [r0, #0] // .................*............................... - mul r12, r7, r2 // .................*............................... - smull r4, r6, r6, r4 // ..................*.............................. - add r5, r8 // ...................*............................. - smlal r7, r1, r12, r3 // ...................*............................. - vmov r12, s6 // ....................*............................ - sub.w r8, r5, r8, lsl #1 // ....................*............................ - add r8, r1 // .....................*........................... - mul r7, r4, r2 // .....................*........................... - sub.w r1, r8, r1, lsl #1 // ......................*.......................... - smull r12, r8, r8, r12 // ......................*.......................... - smlal r4, r6, r7, r3 // .......................*......................... - vmov r7, s8 // ........................*........................ - mul r4, r12, r2 // ........................*........................ - sub.w r11, r9, r11, lsl #1 // .........................*....................... - smull r7, r1, r1, r7 // .........................*....................... - add r11, r6 // ..........................*...................... - smlal r12, r8, r4, r3 // ..........................*...................... - sub.w r12, r11, r6, lsl #1 // ...........................*..................... 
- mul r6, r7, r2 // ...........................*..................... - add r11, r8 // ............................*.................... - str r11, [r0, #-96] // ............................*.................... - sub.w r11, r11, r8, lsl #1 // .............................*................... - smlal r7, r1, r6, r3 // .............................*................... - str.w r11, [r0, #32] // ..............................*.................. - vmov r7, s3 // ..............................*.................. - smull r6, r10, r10, r7 // ...............................*................. - smull r11, r14, r14, r7 // ................................*................ - mul r4, r6, r2 // .................................*............... - vmov r8, s7 // ..................................*.............. - mul r7, r11, r2 // ..................................*.............. - add r12, r1 // ...................................*............. - smlal r6, r10, r4, r3 // ...................................*............. - vmov r6, s5 // ....................................*............ - str r12, [r0, #-32] // ....................................*............ - add r5, r10 // .....................................*........... - smlal r11, r14, r7, r3 // .....................................*........... - sub.w r11, r5, r10, lsl #1 // ......................................*.......... - smull r10, r6, r5, r6 // ......................................*.......... - add r9, r14 // .......................................*......... - smull r7, r4, r11, r8 // .......................................*......... - mul r5, r10, r2 // ........................................*........ - sub.w r8, r9, r14, lsl #1 // .........................................*....... - mul r11, r7, r2 // .........................................*....... - smlal r10, r6, r5, r3 // ..........................................*...... - sub.w r14, r12, r1, lsl #1 // ...........................................*..... 
- smlal r7, r4, r11, r3 // ...........................................*..... - add r9, r6 // ............................................*.... - str r9, [r0, #-128] // ............................................*.... - add r8, r4 // .............................................*... - str r8, [r0, #-64] // .............................................*... - sub.w r11, r8, r4, lsl #1 // ..............................................*.. - str.w r11, [r0, #64] // ..............................................*.. - sub.w r11, r9, r6, lsl #1 // ...............................................*. - str.w r14, [r0, #96] // ...............................................*. - str.w r11, [r0], #128 // ................................................* + // Instructions: 84 + // Expected cycles: 46 + // Expected IPC: 1.83 + // + // Cycle bound: 42.0 + // IPC bound: 2.00 + // + // Wall time: 13.72s + // User time: 13.72s + // + // ------------- cycle (expected) --------------> + // 0 25 + // |------------------------|-------------------- + vmov r12, s2 // *............................................. + ldr r11, [r0, #96] // *............................................. + ldr r10, [r0, #32] // .*............................................ + smull r5, r7, r11, r12 // ..*........................................... + smull r9, r11, r10, r12 // ...*.......................................... + ldr.w r4, [r0, #224] // ....*......................................... + mul r6, r5, r2 // ....*......................................... + ldr.w r10, [r0, #192] // .....*........................................ + mul r14, r9, r2 // .....*........................................ + smull r4, r1, r4, r12 // ......*....................................... + smlal r9, r11, r14, r3 // .......*...................................... + ldr.w r14, [r0, #160] // ........*..................................... + mul r8, r4, r2 // ........*..................................... 
+ ldr r9, [r0, #64] // .........*.................................... + smlal r5, r7, r6, r3 // .........*.................................... + vmov r5, s3 // ..........*................................... + smull r12, r6, r14, r12 // ..........*................................... + add r9, r7 // ...........*.................................. + smlal r4, r1, r8, r3 // ...........*.................................. + sub.w r14, r9, r7, lsl #1 // ............*................................. + mul r8, r12, r2 // ............*................................. + add r10, r1 // .............*................................ + smull r7, r4, r9, r5 // .............*................................ + sub.w r1, r10, r1, lsl #1 // ..............*............................... + smlal r12, r6, r8, r3 // ..............*............................... + vmov r8, s4 // ...............*.............................. + mul r9, r7, r2 // ...............*.............................. + smull r12, r1, r1, r8 // ................*............................. + smlal r7, r4, r9, r3 // .................*............................ + ldr.w r9, [r0, #128] // ..................*........................... + mul r7, r12, r2 // ..................*........................... + smull r5, r10, r10, r5 // ...................*.......................... + add r9, r6 // ....................*......................... + smlal r12, r1, r7, r3 // ....................*......................... + sub.w r12, r9, r6, lsl #1 // .....................*........................ + mul r7, r5, r2 // .....................*........................ + add r12, r1 // ......................*....................... + smull r6, r8, r14, r8 // ......................*....................... + vmov r14, s8 // .......................*...................... + smlal r5, r10, r7, r3 // .......................*...................... + sub.w r1, r12, r1, lsl #1 // ........................*..................... 
+ mul r5, r6, r2 // ........................*..................... + add r9, r10 // .........................*.................... + smull r7, r1, r1, r14 // .........................*.................... + ldr.w r14, [r0] // ..........................*................... + smlal r6, r8, r5, r3 // ..........................*................... + vmov r5, s6 // ...........................*.................. + mul r6, r7, r2 // ...........................*.................. + add r14, r11 // ............................*................. + smull r12, r5, r12, r5 // ............................*................. + sub.w r10, r9, r10, lsl #1 // .............................*................ + smlal r7, r1, r6, r3 // .............................*................ + vmov r6, s7 // ..............................*............... + mul r7, r12, r2 // ..............................*............... + sub.w r11, r14, r11, lsl #1 // ...............................*.............. + smull r6, r10, r10, r6 // ...............................*.............. + add r11, r8 // ................................*............. + smlal r12, r5, r7, r3 // ................................*............. + sub.w r8, r11, r8, lsl #1 // .................................*............ + mul r7, r6, r2 // .................................*............ + add r8, r1 // ..................................*........... + str r8, [r0, #96] // ..................................*........... + sub.w r12, r8, r1, lsl #1 // ...................................*.......... + str.w r12, [r0, #224] // ...................................*.......... + vmov r12, s5 // ....................................*......... + smlal r6, r10, r7, r3 // ....................................*......... + add r11, r5 // .....................................*........ + smull r9, r8, r9, r12 // .....................................*........ + add r14, r4 // ......................................*....... 
+ str r11, [r0, #32] // ......................................*....... + sub.w r11, r11, r5, lsl #1 // .......................................*...... + mul r5, r9, r2 // .......................................*...... + sub.w r1, r14, r4, lsl #1 // ........................................*..... + str.w r11, [r0, #160] // ........................................*..... + add r1, r10 // .........................................*.... + smlal r9, r8, r5, r3 // .........................................*.... + str r1, [r0, #64] // ..........................................*... + sub.w r9, r1, r10, lsl #1 // ..........................................*... + add r14, r8 // ...........................................*.. + str.w r9, [r0, #192] // ...........................................*.. + sub.w r1, r14, r8, lsl #1 // ............................................*. + str.w r1, [r0, #128] // ............................................*. + str r14, [r0] // .............................................* + add.w r0, r0, #256 // .............................................* - // --------------- cycle (expected) ---------------> - // 0 25 - // |------------------------|----------------------- - // ldr.w R5, [R0], #128 // *................................................ - // ldr R6, [R0, #1*128/4-128] // .....*........................................... - // ldr R7, [R0, #2*128/4-128] // .......*......................................... - // ldr R8, [R0, #3*128/4-128] // ..*.............................................. - // ldr.w R4, [R0, #4*128/4-128] // .................*............................... - // ldr.w R11, [R0, #5*128/4-128] // .........*....................................... - // ldr.w R12, [R0, #6*128/4-128] // ....*............................................ - // ldr.w R14, [R0, #7*128/4-128] // ...*............................................. - // vmov R1, s2 // *................................................ 
- // smull R9, R6, R6, R1 // .........*....................................... - // mul R10, R9, R2 // ...........*..................................... - // smlal R9, R6, R10, R3 // .............*................................... - // smull R9, R8, R8, R1 // ....*............................................ - // mul R10, R9, R2 // ......*.......................................... - // smlal R9, R8, R10, R3 // ........*........................................ - // smull R9, R11, R11, R1 // ............*.................................... - // mul R10, R9, R2 // ..............*.................................. - // smlal R9, R11, R10, R3 // ................*................................ - // smull R9, R14, R14, R1 // .....*........................................... - // mul R10, R9, R2 // .......*......................................... - // smlal R9, R14, R10, R3 // ..........*...................................... - // add R5, R6 // ...............*................................. - // add R7, R8 // ..........*...................................... - // add R4, R11 // ...................*............................. - // add R12, R14 // ............*.................................... - // sub.w R6, R5, R6, lsl #1 // .........................*....................... - // sub.w R8, R7, R8, lsl #1 // ...........*..................................... - // sub.w R11, R4, R11, lsl #1 // ....................*............................ - // sub.w R14, R12, R14, lsl #1 // .............*................................... - // vmov R1, s3 // ..............................*.................. - // smull R9, R7, R7, R1 // ................................*................ - // mul R10, R9, R2 // ..................................*.............. - // smlal R9, R7, R10, R3 // .....................................*........... - // smull R9, R12, R12, R1 // ...............................*................. 
- // mul R10, R9, R2 // .................................*............... - // smlal R9, R12, R10, R3 // ...................................*............. - // vmov R1, s4 // ..............*.................................. - // smull R9, R8, R8, R1 // ..................*.............................. - // mul R10, R9, R2 // .....................*........................... - // smlal R9, R8, R10, R3 // .......................*......................... - // smull R9, R14, R14, R1 // ...............*................................. - // mul R10, R9, R2 // .................*............................... - // smlal R9, R14, R10, R3 // ...................*............................. - // add R5, R7 // .......................................*......... - // add R6, R8 // ..........................*...................... - // add R4, R12 // .....................................*........... - // add R11, R14 // .....................*........................... - // sub.w R7, R5, R7, lsl #1 // .........................................*....... - // sub.w R8, R6, R8, lsl #1 // ...........................*..................... - // sub.w R12, R4, R12, lsl #1 // ......................................*.......... - // sub.w R14, R11, R14, lsl #1 // ......................*.......................... - // vmov R1, s5 // ....................................*............ - // smull R9, R4, R4, R1 // ......................................*.......... - // mul R10, R9, R2 // ........................................*........ - // smlal R9, R4, R10, R3 // ..........................................*...... - // vmov R1, s6 // ....................*............................ - // smull R9, R11, R11, R1 // ......................*.......................... - // mul R10, R9, R2 // ........................*........................ - // smlal R9, R11, R10, R3 // ..........................*...................... - // vmov R1, s7 // ..................................*.............. 
- // smull R9, R12, R12, R1 // .......................................*......... - // mul R10, R9, R2 // .........................................*....... - // smlal R9, R12, R10, R3 // ...........................................*..... - // vmov R1, s8 // ........................*........................ - // smull R9, R14, R14, R1 // .........................*....................... - // mul R10, R9, R2 // ...........................*..................... - // smlal R9, R14, R10, R3 // .............................*................... - // add R5, R4 // ............................................*.... - // add R6, R11 // ............................*.................... - // add R7, R12 // .............................................*... - // add R8, R14 // ...................................*............. - // sub.w R4, R5, R4, lsl #1 // ...............................................*. - // sub.w R11, R6, R11, lsl #1 // .............................*................... - // sub.w R12, R7, R12, lsl #1 // ..............................................*.. - // sub.w R14, R8, R14, lsl #1 // ...........................................*..... - // str R6, [R0, #1*128/4-128] // ............................*.................... - // str R7, [R0, #2*128/4-128] // .............................................*... - // str R8, [R0, #3*128/4-128] // ....................................*............ - // str.w R11, [R0, #5*128/4-128] // ..............................*.................. - // str.w R12, [R0, #6*128/4-128] // ..............................................*.. - // str.w R14, [R0, #7*128/4-128] // ...............................................*. - // str R5, [R0, #-128] // ............................................*.... 
- // str.w R4, [R0], #128 // ................................................* + // ------------- cycle (expected) --------------> + // 0 25 + // |------------------------|-------------------- + // ldr.w R5, [R0] // ..........................*................... + // ldr R6, [R0, #1*128/4] // .*............................................ + // ldr R7, [R0, #2*128/4] // .........*.................................... + // ldr R8, [R0, #3*128/4] // *............................................. + // ldr.w R4, [R0, #4*128/4] // ..................*........................... + // ldr.w R11, [R0, #5*128/4] // ........*..................................... + // ldr.w R12, [R0, #6*128/4] // .....*........................................ + // ldr.w R14, [R0, #7*128/4] // ....*......................................... + // vmov R1, s2 // *............................................. + // smull R9, R6, R6, R1 // ...*.......................................... + // mul R10, R9, R2 // .....*........................................ + // smlal R9, R6, R10, R3 // .......*...................................... + // smull R9, R8, R8, R1 // ..*........................................... + // mul R10, R9, R2 // ....*......................................... + // smlal R9, R8, R10, R3 // .........*.................................... + // smull R9, R11, R11, R1 // ..........*................................... + // mul R10, R9, R2 // ............*................................. + // smlal R9, R11, R10, R3 // ..............*............................... + // smull R9, R14, R14, R1 // ......*....................................... + // mul R10, R9, R2 // ........*..................................... + // smlal R9, R14, R10, R3 // ...........*.................................. + // add R5, R6 // ............................*................. + // add R7, R8 // ...........*.................................. + // add R4, R11 // ....................*......................... 
+ // add R12, R14 // .............*................................ + // sub.w R6, R5, R6, lsl #1 // ...............................*.............. + // sub.w R8, R7, R8, lsl #1 // ............*................................. + // sub.w R11, R4, R11, lsl #1 // .....................*........................ + // sub.w R14, R12, R14, lsl #1 // ..............*............................... + // vmov R1, s3 // ..........*................................... + // smull R9, R7, R7, R1 // .............*................................ + // mul R10, R9, R2 // ...............*.............................. + // smlal R9, R7, R10, R3 // .................*............................ + // smull R9, R12, R12, R1 // ...................*.......................... + // mul R10, R9, R2 // .....................*........................ + // smlal R9, R12, R10, R3 // .......................*...................... + // vmov R1, s4 // ...............*.............................. + // smull R9, R8, R8, R1 // ......................*....................... + // mul R10, R9, R2 // ........................*..................... + // smlal R9, R8, R10, R3 // ..........................*................... + // smull R9, R14, R14, R1 // ................*............................. + // mul R10, R9, R2 // ..................*........................... + // smlal R9, R14, R10, R3 // ....................*......................... + // add R5, R7 // ......................................*....... + // add R6, R8 // ................................*............. + // add R4, R12 // .........................*.................... + // add R11, R14 // ......................*....................... + // sub.w R7, R5, R7, lsl #1 // ........................................*..... + // sub.w R8, R6, R8, lsl #1 // .................................*............ + // sub.w R12, R4, R12, lsl #1 // .............................*................ 
+ // sub.w R14, R11, R14, lsl #1 // ........................*..................... + // vmov R1, s5 // ....................................*......... + // smull R9, R4, R4, R1 // .....................................*........ + // mul R10, R9, R2 // .......................................*...... + // smlal R9, R4, R10, R3 // .........................................*.... + // vmov R1, s6 // ...........................*.................. + // smull R9, R11, R11, R1 // ............................*................. + // mul R10, R9, R2 // ..............................*............... + // smlal R9, R11, R10, R3 // ................................*............. + // vmov R1, s7 // ..............................*............... + // smull R9, R12, R12, R1 // ...............................*.............. + // mul R10, R9, R2 // .................................*............ + // smlal R9, R12, R10, R3 // ....................................*......... + // vmov R1, s8 // .......................*...................... + // smull R9, R14, R14, R1 // .........................*.................... + // mul R10, R9, R2 // ...........................*.................. + // smlal R9, R14, R10, R3 // .............................*................ + // add R5, R4 // ...........................................*.. + // add R6, R11 // .....................................*........ + // add R7, R12 // .........................................*.... + // add R8, R14 // ..................................*........... + // sub.w R4, R5, R4, lsl #1 // ............................................*. + // sub.w R11, R6, R11, lsl #1 // .......................................*...... + // sub.w R12, R7, R12, lsl #1 // ..........................................*... + // sub.w R14, R8, R14, lsl #1 // ...................................*.......... + // str R6, [R0, #1*128/4] // ......................................*....... 
+ // str R7, [R0, #2*128/4] // ..........................................*... + // str R8, [R0, #3*128/4] // ..................................*........... + // str.w R4, [R0, #4*128/4] // ............................................*. + // str.w R11, [R0, #5*128/4] // ........................................*..... + // str.w R12, [R0, #6*128/4] // ...........................................*.. + // str.w R14, [R0, #7*128/4] // ...................................*.......... + // str R5, [R0] // .............................................* + // add.w R0, R0, #256 // .............................................* vmov temp_l, s10 cmp ptr_p, temp_l @@ -874,256 +887,265 @@ layer456_loop: add.w cntr, ptr_p, #64*strincr3 // 64 iterations vmov s9, cntr - // Instructions: 2 - // Expected cycles: 1 - // Expected IPC: 2.00 - // - // Cycle bound: 1.0 - // IPC bound: 2.00 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr.w r8, [r1, #8] // *............................. - ldr.w r6, [r1, #4] // *............................. + // Instructions: 9 + // Expected cycles: 7 + // Expected IPC: 1.29 + // + // Cycle bound: 7.0 + // IPC bound: 1.29 + // + // Wall time: 0.02s + // User time: 0.02s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr.w r9, [r0, #768] // *............................. + ldr r11, [r1], #12 // *............................. + ldr.w r14, [r0, #512] // .*............................ + ldr.w r10, [r0, #256] // ..*........................... + smull r9, r6, r9, r11 // ..*........................... + ldr r4, [r1, #-8] // ...*.......................... + ldr.w r12, [r1], #16 // ....*......................... + mul r7, r9, r2 // ....*......................... + smlal r9, r6, r7, r3 // ......*....................... 
- // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr.w r8, [r1, #8] // *.............................. - // ldr.w r6, [r1, #4] // *.............................. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr r11, [r1], #12 // *.............................. + // ldr.w r6, [r0, #768] // *.............................. + // smull r9, r6, r6, r11 // ..*............................ + // mul r7, r9, r2 // ....*.......................... + // smlal r9, r6, r7, r3 // ......*........................ + // ldr.w r14, [r0, #512] // .*............................. + // ldr.w r10, [r0, #256] // ..*............................ + // ldr r4, [r1, #-8] // ...*........................... + // ldr.w r12, [r1], #16 // ....*.......................... - push {cntr} - vmov cntr, s9 - sub cntr, cntr, #4 - vmov s9, cntr - pop {cntr} + push {r8} + vmov r8, s9 + sub r8, r8, #4 + vmov s9, r8 + pop {r8} layer78_loop: - // Instructions: 47 - // Expected cycles: 32 - // Expected IPC: 1.47 - // - // Cycle bound: 26.0 - // IPC bound: 1.81 - // - // Wall time: 6.05s - // User time: 6.05s - // - // ------ cycle (expected) -------> - // 0 25 - // |------------------------|------ - ldr.w r7, [r0, #768] // *............................... - ldr r11, [r1], #12 // *............................... - ldr.w r5, [r0, #256] // .*.............................. - smull r9, r10, r7, r11 // ..*............................. - smull r11, r14, r5, r11 // ...*............................ - ldr.w r5, [r0, #512] // ....*........................... - mul r12, r9, r2 // ....*........................... - mul r4, r11, r2 // .....*.......................... - smlal r9, r10, r12, r3 // ......*......................... - ldr.w r9, [r0] // .......*........................ - smlal r11, r14, r4, r3 // .......*........................ - add r5, r10 // ........*....................... 
- sub.w r7, r5, r10, lsl #1 // .........*...................... - smull r12, r6, r5, r6 // .........*...................... - add r9, r14 // ..........*..................... - smull r5, r7, r7, r8 // ..........*..................... - ldr.w r8, [r1, #12] // ...........*.................... - mul r11, r12, r2 // ...........*.................... - sub.w r4, r9, r14, lsl #1 // ............*................... - mul r14, r5, r2 // ............*................... - ldr.w r10, [r1, #8] // .............*.................. - smlal r12, r6, r11, r3 // .............*.................. - ldr.w r12, [r1, #4] // ..............*................. - ldr.w r11, [r1], #16 // ..............*................. - add r9, r6 // ...............*................ - smlal r5, r7, r14, r3 // ...............*................ - sub.w r6, r9, r6, lsl #1 // ................*............... - smull r5, r9, r9, r11 // ................*............... - add r4, r7 // .................*.............. - smull r10, r6, r6, r10 // .................*.............. - sub.w r11, r4, r7, lsl #1 // ..................*............. - smull r14, r7, r4, r12 // ..................*............. - mul r4, r10, r2 // ...................*............ - smull r12, r11, r11, r8 // ....................*........... - ldr.w r8, [r1, #8] // .....................e.......... - smlal r10, r6, r4, r3 // .....................*.......... - mul r10, r12, r2 // ......................*......... - mul r4, r14, r2 // .......................*........ - smlal r12, r11, r10, r3 // ........................*....... - str.w r11, [r0, #768] // .........................*...... - smlal r14, r7, r4, r3 // ..........................*..... - str.w r7, [r0, #256] // ...........................*.... - mul r10, r5, r2 // ............................*... - str.w r6, [r0, #512] // .............................*.. - ldr.w r6, [r1, #4] // .............................e.. - smlal r5, r9, r10, r3 // ..............................*. 
- str r9, [r0], #4 // ...............................* // @slothy:core + // Instructions: 50 + // Expected cycles: 28 + // Expected IPC: 1.79 + // + // Cycle bound: 34.0 + // IPC bound: 1.47 + // + // Wall time: 15.28s + // User time: 15.28s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + add r14, r6 // *............................. + smull r11, r9, r10, r11 // *............................. + smull r4, r5, r14, r4 // .*............................ + sub.w r7, r14, r6, lsl #1 // ..*........................... + mul r10, r11, r2 // ..*........................... + ldr r6, [r1, #-20] // ...*.......................... + mul r8, r4, r2 // ...*.......................... + ldr.w r14, [r0] // ....*......................... + smlal r11, r9, r10, r3 // ....*......................... + ldr r11, [r1], #12 // .....e........................ + smull r7, r10, r7, r6 // .....*........................ + ldr.w r6, [r0, #772] // ......e....................... + smlal r4, r5, r8, r3 // ......*....................... + add r14, r9 // .......*...................... + mul r8, r7, r2 // .......*...................... + sub.w r4, r14, r9, lsl #1 // ........*..................... + smull r9, r6, r6, r11 // ........e..................... + smlal r7, r10, r8, r3 // .........*.................... + add r14, r5 // ..........*................... + mul r7, r9, r2 // ..........e................... + sub.w r5, r14, r5, lsl #1 // ...........*.................. + smull r14, r12, r14, r12 // ...........*.................. + ldr r8, [r1, #-24] // ............*................. + smlal r9, r6, r7, r3 // ............e................. + add r4, r10 // .............*................ + mul r9, r14, r2 // .............*................ + sub.w r10, r4, r10, lsl #1 // ..............*............... + smull r8, r7, r4, r8 // ..............*............... + smlal r14, r12, r9, r3 // ...............*.............. + mul r14, r8, r2 // ................*............. 
+ str r12, [r0], #4 // .................*............ // @slothy:core // @slothy:before=cmp + ldr r4, [r1, #-20] // ..................*........... + smlal r8, r7, r14, r3 // ..................*........... + str.w r7, [r0, #252] // ...................*.......... + ldr r14, [r1, #-16] // ...................*.......... + vmov r8, s9 // ....................*......... + smull r12, r7, r5, r4 // ....................*......... + cmp.w r0, r8 // .....................*........ // @slothy:id=cmp + smull r5, r9, r10, r14 // .....................*........ + ldr.w r14, [r0, #512] // ......................e....... + mul r4, r12, r2 // ......................*....... + mul r8, r5, r2 // .......................*...... + ldr.w r10, [r0, #256] // ........................e..... + smlal r12, r7, r4, r3 // ........................*..... + ldr r4, [r1, #-8] // .........................e.... + smlal r5, r9, r8, r3 // .........................*.... + ldr.w r12, [r1], #16 // ..........................e... + str.w r9, [r0, #764] // ..........................*... + str.w r7, [r0, #508] // ...........................*.. + bne.w layer78_loop // ...........................*.. // @slothy:branch - // ------------ cycle (expected) ------------> + // ---------------- cycle (expected) ----------------> // 0 25 - // |------------------------|----------------- - // ldr.w R12, [R1, #4] // ........e..'............................~.. - // ldr.w R14, [R1, #8] // e..........'....................~.......... - // ldr R11, [R1], #12 // ...........*............................... - // ldr.w R5, [R0] // ...........'......*........................ - // ldr.w R6, [R0, #256] // ...........'*.............................. - // ldr.w R7, [R0, #512] // ...........'...*........................... - // ldr.w R8, [R0, #768] // ...........*............................... - // smull R9, R6, R6, R11 // ...........'..*............................ - // mul R10, R9, R2 // ...........'....*.......................... 
- // smlal R9, R6, R10, R3 // ...........'......*........................ - // smull R9, R8, R8, R11 // ...........'.*............................. - // mul R10, R9, R2 // ...........'...*........................... - // smlal R9, R8, R10, R3 // ...........'.....*......................... - // add R5, R6 // ...........'.........*..................... - // add R7, R8 // ...........'.......*....................... - // sub.w R6, R5, R6, lsl #1 // ...........'...........*................... - // sub.w R8, R7, R8, lsl #1 // ...........'........*...................... - // smull R9, R7, R7, R12 // ...........'........*...................... - // mul R10, R9, R2 // ...........'..........*.................... - // smlal R9, R7, R10, R3 // ...........'............*.................. - // smull R9, R8, R8, R14 // ...........'.........*..................... - // mul R10, R9, R2 // ...........'...........*................... - // smlal R9, R8, R10, R3 // ...........'..............*................ - // add R5, R7 // ...........'..............*................ - // add R6, R8 // ...........'................*.............. - // sub.w R7, R5, R7, lsl #1 // ...........'...............*............... - // sub.w R8, R6, R8, lsl #1 // ...........'.................*............. - // ldr.w R12, [R1, #4] // ...........'.............*................. - // ldr.w R14, [R1, #8] // ...........'............*.................. - // ldr.w R11, [R1, #12] // ...........'..........*.................... - // ldr.w R4, [R1], #16 // ...........'.............*................. - // smull R9, R5, R5, R4 // ...........'...............*............... - // mul R10, R9, R2 // .......~...'...........................*... - // smlal R9, R5, R10, R3 // .........~.'.............................*. - // smull R9, R6, R6, R12 // ...........'.................*............. - // mul R10, R9, R2 // ..~........'......................*........ 
- // smlal R9, R6, R10, R3 // .....~.....'.........................*..... - // smull R9, R7, R7, R14 // ...........'................*.............. - // mul R10, R9, R2 // ...........'..................*............ - // smlal R9, R7, R10, R3 // ~..........'....................*.......... - // smull R9, R8, R8, R11 // ...........'...................*........... - // mul R10, R9, R2 // .~.........'.....................*......... - // smlal R9, R8, R10, R3 // ...~.......'.......................*....... - // str.w R6, [R0, #256] // ......~....'..........................*.... - // str.w R7, [R0, #512] // ........~..'............................*.. - // str.w R8, [R0, #768] // ....~......'........................*...... - // str R5, [R0], #4 // ..........~'..............................* + // |------------------------|------------------------- + // ldr.w R12, [R1, #4] // ....................e..'........................~.. + // ldr.w R14, [R1, #8] // .......................'..*........................ + // ldr R11, [R1], #12 // e......................'....~...................... + // ldr.w R5, [R0] // .......................'...*....................... + // ldr.w R6, [R0, #256] // ...................e...'.......................~... + // ldr.w R7, [R0, #512] // .................e.....'.....................~..... + // ldr.w R8, [R0, #768] // .e.....................'.....~..................... + // smull R9, R6, R6, R11 // .......................*........................... + // mul R10, R9, R2 // .......................'.*......................... + // smlal R9, R6, R10, R3 // .......................'...*....................... + // smull R9, R8, R8, R11 // ...e...................'.......~................... + // mul R10, R9, R2 // .....e.................'.........~................. + // smlal R9, R8, R10, R3 // .......e...............'...........~............... + // add R5, R6 // ..~....................'......*.................... 
+ // add R7, R8 // .......................*........................... + // sub.w R6, R5, R6, lsl #1 // ...~...................'.......*................... + // sub.w R8, R7, R8, lsl #1 // .......................'.*......................... + // smull R9, R7, R7, R12 // .......................'*.......................... + // mul R10, R9, R2 // .......................'..*........................ + // smlal R9, R7, R10, R3 // .~.....................'.....*..................... + // smull R9, R8, R8, R14 // ~......................'....*...................... + // mul R10, R9, R2 // ..~....................'......*.................... + // smlal R9, R8, R10, R3 // ....~..................'........*.................. + // add R5, R7 // .....~.................'.........*................. + // add R6, R8 // ........~..............'............*.............. + // sub.w R7, R5, R7, lsl #1 // ......~................'..........*................ + // sub.w R8, R6, R8, lsl #1 // .........~.............'.............*............. + // ldr.w R12, [R1, #4] // .......~...............'...........*............... + // ldr.w R14, [R1, #8] // .............~.........'.................*......... + // ldr.w R11, [R1, #12] // ..............~........'..................*........ + // ldr.w R4, [R1], #16 // .....................e.'.........................~. + // smull R9, R5, R5, R4 // ......~................'..........*................ + // mul R10, R9, R2 // ........~..............'............*.............. + // smlal R9, R5, R10, R3 // ..........~............'..............*............ + // smull R9, R6, R6, R12 // .........~.............'.............*............. + // mul R10, R9, R2 // ...........~...........'...............*........... + // smlal R9, R6, R10, R3 // .............~.........'.................*......... + // smull R9, R7, R7, R14 // ...............~.......'...................*....... + // mul R10, R9, R2 // .................~.....'.....................*..... 
+ // smlal R9, R7, R10, R3 // ...................~...'.......................*... + // smull R9, R8, R8, R11 // ................~......'....................*...... + // mul R10, R9, R2 // ..................~....'......................*.... + // smlal R9, R8, R10, R3 // ....................~..'........................*.. + // str.w R6, [R0, #256] // ..............~........'..................*........ + // str.w R7, [R0, #512] // ......................~'..........................* + // str.w R8, [R0, #768] // .....................~.'.........................*. + // str R5, [R0], #4 // ............~..........'................*.......... + // vmov R4, s9 // ...............~.......'...................*....... + // cmp.w R0, R4 // ................~......'....................*...... + // bne.w layer78_loop // ......................~'..........................* - vmov cntr, s9 - cmp ptr_p, cntr - bne layer78_loop - // Instructions: 45 - // Expected cycles: 32 - // Expected IPC: 1.41 - // - // Cycle bound: 32.0 - // IPC bound: 1.41 - // - // Wall time: 1.13s - // User time: 1.13s - // - // ------ cycle (expected) -------> - // 0 25 - // |------------------------|------ - ldr.w r9, [r0, #768] // *............................... - ldr r4, [r1], #12 // *............................... - ldr.w r10, [r0, #256] // .*.............................. - ldr.w r12, [r1, #8] // ..*............................. - smull r9, r11, r9, r4 // ..*............................. - smull r5, r14, r10, r4 // ...*............................ - mul r4, r9, r2 // ....*........................... - mul r7, r5, r2 // .....*.......................... - ldr.w r10, [r0, #512] // ......*......................... - smlal r9, r11, r4, r3 // ......*......................... - ldr.w r4, [r1, #12] // .......*........................ - smlal r5, r14, r7, r3 // .......*........................ - add r10, r11 // ........*....................... - sub.w r11, r10, r11, lsl #1 // .........*...................... 
- smull r10, r9, r10, r6 // .........*...................... - ldr.w r6, [r0] // ..........*..................... - smull r5, r8, r11, r8 // ..........*..................... - mul r7, r10, r2 // ...........*.................... - add r6, r14 // ............*................... - mul r11, r5, r2 // ............*................... - sub.w r14, r6, r14, lsl #1 // .............*.................. - smlal r10, r9, r7, r3 // .............*.................. - ldr.w r7, [r1, #4] // ..............*................. - ldr.w r10, [r1], #16 // ..............*................. - add r6, r9 // ...............*................ - smlal r5, r8, r11, r3 // ...............*................ - sub.w r9, r6, r9, lsl #1 // ................*............... - smull r6, r10, r6, r10 // ................*............... - add r14, r8 // .................*.............. - smull r12, r9, r9, r12 // .................*.............. - sub.w r11, r14, r8, lsl #1 // ..................*............. - mul r5, r6, r2 // ..................*............. - smull r8, r14, r14, r7 // ...................*............ - smlal r6, r10, r5, r3 // ....................*........... - smull r4, r11, r11, r4 // .....................*.......... - mul r6, r12, r2 // ......................*......... - mul r7, r4, r2 // .......................*........ - smlal r12, r9, r6, r3 // ........................*....... - smlal r4, r11, r7, r3 // .........................*...... - str.w r11, [r0, #768] // ..........................*..... - mul r12, r8, r2 // ...........................*.... - str.w r9, [r0, #512] // ............................*... - smlal r8, r14, r12, r3 // .............................*.. - str.w r14, [r0, #256] // ..............................*. 
- str r10, [r0], #4 // ...............................* // @slothy:core - // ------ cycle (expected) -------> + // Instructions: 41 + // Expected cycles: 26 + // Expected IPC: 1.58 + // + // Cycle bound: 26.0 + // IPC bound: 1.58 + // + // Wall time: 1.55s + // User time: 1.55s + // + // ----- cycle (expected) ------> // 0 25 - // |------------------------|------ - // ldr.w r7, [r0, #768] // *............................... - // ldr r11, [r1], #12 // *............................... - // ldr.w r5, [r0, #256] // .*.............................. - // smull r9, r10, r7, r11 // ..*............................. - // smull r11, r14, r5, r11 // ...*............................ - // ldr.w r5, [r0, #512] // ......*......................... - // mul r12, r9, r2 // ....*........................... - // mul r4, r11, r2 // .....*.......................... - // smlal r9, r10, r12, r3 // ......*......................... - // ldr.w r9, [r0] // ..........*..................... - // smlal r11, r14, r4, r3 // .......*........................ - // add r5, r10 // ........*....................... - // sub.w r7, r5, r10, lsl #1 // .........*...................... - // smull r12, r6, r5, r6 // .........*...................... - // add r9, r14 // ............*................... - // smull r5, r7, r7, r8 // ..........*..................... - // ldr.w r8, [r1, #12] // .......*........................ - // mul r11, r12, r2 // ...........*.................... - // sub.w r4, r9, r14, lsl #1 // .............*.................. - // mul r14, r5, r2 // ............*................... - // ldr.w r10, [r1, #8] // ..*............................. - // smlal r12, r6, r11, r3 // .............*.................. - // ldr.w r12, [r1, #4] // ..............*................. - // ldr.w r11, [r1], #16 // ..............*................. - // add r9, r6 // ...............*................ - // smlal r5, r7, r14, r3 // ...............*................ 
- // sub.w r6, r9, r6, lsl #1 // ................*............... - // smull r5, r9, r9, r11 // ................*............... - // add r4, r7 // .................*.............. - // smull r10, r6, r6, r10 // .................*.............. - // sub.w r11, r4, r7, lsl #1 // ..................*............. - // smull r14, r7, r4, r12 // ...................*............ - // mul r4, r10, r2 // ......................*......... - // smull r12, r11, r11, r8 // .....................*.......... - // smlal r10, r6, r4, r3 // ........................*....... - // mul r10, r12, r2 // .......................*........ - // mul r4, r14, r2 // ...........................*.... - // smlal r12, r11, r10, r3 // .........................*...... - // str.w r11, [r0, #768] // ..........................*..... - // smlal r14, r7, r4, r3 // .............................*.. - // str.w r7, [r0, #256] // ..............................*. - // mul r10, r5, r2 // ..................*............. - // str.w r6, [r0, #512] // ............................*... - // smlal r5, r9, r10, r3 // ....................*........... - // str r9, [r0], #4 // ...............................* + // |------------------------|---- + add r14, r6 // *............................. + smull r8, r9, r10, r11 // *............................. + sub.w r7, r14, r6, lsl #1 // .*............................ + smull r4, r14, r14, r4 // .*............................ + ldr.w r11, [r0] // ..*........................... + mul r5, r8, r2 // ..*........................... + ldr r10, [r1, #-20] // ...*.......................... + mul r6, r4, r2 // ...*.......................... + smlal r8, r9, r5, r3 // ....*......................... + smlal r4, r14, r6, r3 // .....*........................ + add r11, r9 // ......*....................... + smull r8, r5, r7, r10 // ......*....................... + sub.w r10, r11, r9, lsl #1 // .......*...................... + add r11, r14 // .......*...................... 
+ sub.w r6, r11, r14, lsl #1 // ........*..................... + smull r11, r12, r11, r12 // ........*..................... + ldr r7, [r1, #-12] // .........*.................... + mul r9, r8, r2 // .........*.................... + ldr r14, [r1, #-4] // ..........*................... + mul r4, r11, r2 // ..........*................... + smlal r8, r5, r9, r3 // ...........*.................. + ldr r8, [r1, #-8] // ............*................. + smlal r11, r12, r4, r3 // ............*................. + add r10, r5 // .............*................ + str r12, [r0], #4 // .............*................ // @slothy:core // @slothy:before=cmp + sub.w r12, r10, r5, lsl #1 // ..............*............... + smull r5, r4, r6, r8 // ..............*............... + smull r6, r8, r12, r14 // ...............*.............. + vmov r14, s9 // ................*............. + smull r12, r11, r10, r7 // ................*............. + mul r10, r6, r2 // .................*............ + mul r9, r5, r2 // ..................*........... + smlal r6, r8, r10, r3 // ...................*.......... + smlal r5, r4, r9, r3 // ....................*......... + str.w r4, [r0, #508] // .....................*........ + mul r5, r12, r2 // ......................*....... + str.w r8, [r0, #764] // .......................*...... + cmp.w r0, r14 // .......................*...... // @slothy:id=cmp + smlal r12, r11, r5, r3 // ........................*..... + str.w r11, [r0, #252] // .........................*.... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // add r14, r6 // *.............................. + // smull r11, r9, r10, r11 // *.............................. + // smull r4, r5, r14, r4 // .*............................. + // sub.w r7, r14, r6, lsl #1 // .*............................. + // mul r10, r11, r2 // ..*............................ + // ldr r6, [r1, #-20] // ...*........................... 
+ // mul r8, r4, r2 // ...*........................... + // ldr.w r14, [r0] // ..*............................ + // smlal r11, r9, r10, r3 // ....*.......................... + // smull r7, r10, r7, r6 // ......*........................ + // smlal r4, r5, r8, r3 // .....*......................... + // add r14, r9 // ......*........................ + // mul r8, r7, r2 // .........*..................... + // sub.w r4, r14, r9, lsl #1 // .......*....................... + // smlal r7, r10, r8, r3 // ...........*................... + // add r14, r5 // .......*....................... + // sub.w r5, r14, r5, lsl #1 // ........*...................... + // smull r14, r12, r14, r12 // ........*...................... + // ldr r8, [r1, #-12] // .........*..................... + // add r4, r10 // .............*................. + // mul r9, r14, r2 // ..........*.................... + // sub.w r10, r4, r10, lsl #1 // ..............*................ + // smull r8, r7, r4, r8 // ................*.............. + // smlal r14, r12, r9, r3 // ............*.................. + // mul r14, r8, r2 // ......................*........ + // str r12, [r0], #4 // .............*................. + // ldr r4, [r1, #-8] // ............*.................. + // smlal r8, r7, r14, r3 // ........................*...... + // str.w r7, [r0, #252] // .........................*..... + // ldr r14, [r1, #-4] // ..........*.................... + // vmov r8, s9 // ................*.............. + // smull r12, r7, r5, r4 // ..............*................ + // cmp.w r0, r8 // .......................*....... + // smull r5, r9, r10, r14 // ...............*............... + // mul r4, r12, r2 // ..................*............ + // mul r8, r5, r2 // .................*............. + // smlal r12, r7, r4, r3 // ....................*.......... + // smlal r5, r9, r8, r3 // ...................*........... + // str.w r9, [r0, #764] // .......................*....... 
+ // str.w r7, [r0, #508] // .....................*......... + // bne.w layer78_loop // .........................*..... // restore registers From 5b47a4b867382899b492272068d6207fda744409 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 13 Jan 2025 10:05:44 +0100 Subject: [PATCH 3/3] CM7: Simplify Dilithium 769 NTT code --- example.py | 8 +- examples/naive/armv7m/ntt_769_dilithium.s | 6 +- .../opt/armv7m/ntt_769_dilithium_opt_m7.s | 2009 ++++++++++------- 3 files changed, 1169 insertions(+), 854 deletions(-) diff --git a/example.py b/example.py index ab328dc8..51146036 100644 --- a/example.py +++ b/example.py @@ -1814,6 +1814,7 @@ def core(self, slothy): slothy.config.constraints.stalls_first_attempt = 32 r = slothy.config.reserved_regs + r.add("r1") r = r.union(f"s{i}" for i in range(31)) # reserve FPR slothy.config.reserved_regs = r @@ -1825,13 +1826,12 @@ def core(self, slothy): slothy.config.variable_size = True slothy.config.split_heuristic = True slothy.config.timeout = 360 # Not more than 2min per step - slothy.config.split_heuristic_factor = 1 slothy.config.visualize_expected_performance = False - slothy.config.split_heuristic_factor = 4 + slothy.config.split_heuristic_factor = 5 slothy.config.split_heuristic_stepsize = 0.15 - slothy.optimize_loop("layer1234_loop") + slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop) slothy.config.split_heuristic_optimize_seam = 6 - slothy.optimize_loop("layer1234_loop") + slothy.optimize_loop("layer1234_loop", forced_loop_type=Arch_Armv7M.BranchLoop) slothy.config.outputs = ["r14"] diff --git a/examples/naive/armv7m/ntt_769_dilithium.s b/examples/naive/armv7m/ntt_769_dilithium.s index 6f7c51a7..ee67b44a 100644 --- a/examples/naive/armv7m/ntt_769_dilithium.s +++ b/examples/naive/armv7m/ntt_769_dilithium.s @@ -146,7 +146,6 @@ small_ntt_asm_769: // s24: tmp // s25: twiddle_ptr vmov s24, tmp - vmov s25, twiddle_ptr layer1234_loop: // load a1, a3, ..., a15 vmov s23, poly @@ -251,10 +250,10 
@@ small_ntt_asm_769: uadd16 tmp, poly0, poly1 usub16 twiddle1, poly0, poly1 str.w twiddle1, [poly, #offset] - str.w tmp, [poly], #4 // @slothy:core + str.w tmp, [poly], #4 // @slothy:core // @slothy:before=cmp vmov tmp, s24 - cmp.w poly, tmp + cmp.w poly, tmp // @slothy:id=cmp bne.w layer1234_loop sub.w poly, #8*strincr @@ -266,7 +265,6 @@ small_ntt_asm_769: add.w tmp, poly, #strincr2*16 vmov s13, tmp - vmov twiddle_ptr, s25 layer567_loop: vmov s23, poly load poly, poly0, poly1, poly2, poly3, #0, #distance2/4, #2*distance2/4, #3*distance2/4 diff --git a/examples/opt/armv7m/ntt_769_dilithium_opt_m7.s b/examples/opt/armv7m/ntt_769_dilithium_opt_m7.s index 3af47ba0..cfc7bfc2 100644 --- a/examples/opt/armv7m/ntt_769_dilithium_opt_m7.s +++ b/examples/opt/armv7m/ntt_769_dilithium_opt_m7.s @@ -146,628 +146,948 @@ small_ntt_asm_769_opt_m7: // s24: tmp // s25: twiddle_ptr vmov s24, tmp - vmov s25, twiddle_ptr layer1234_loop: - // Instructions: 299 - // Expected cycles: 153 - // Expected IPC: 1.95 + // Instructions: 302 + // Expected cycles: 159 + // Expected IPC: 1.90 // - // ------------------------------------------------------------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 250 275 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- - vmov r6, s8 // 
*.......................................................................................................................................................................................................................................................................................................... - ldr.w r3, [r0, #352] // .*......................................................................................................................................................................................................................................................................................................... - ldr.w r11, [r0, #96] // ....*...................................................................................................................................................................................................................................................................................................... - vmov s23, r0 // ..*........................................................................................................................................................................................................................................................................................................ - ldr.w r1, [r0, #288] // ........*.................................................................................................................................................................................................................................................................................................. - smulwb r10, r6, r3 // .....*..................................................................................................................................................................................................................................................................................................... 
- ldr.w r2, [r0, #480] // ..............*............................................................................................................................................................................................................................................................................................ - smulwt r7, r6, r3 // ...........*............................................................................................................................................................................................................................................................................................... - ldr.w r4, [r0, #32] // ............*.............................................................................................................................................................................................................................................................................................. - smulwb r8, r6, r1 // ...............*........................................................................................................................................................................................................................................................................................... - ldr.w r5, [r0, #416] // ...*....................................................................................................................................................................................................................................................................................................... - smulwb r14, r6, r2 // ...........................*............................................................................................................................................................................................................................................................................... 
- ldr.w r9, [r0, #160] // ......*.................................................................................................................................................................................................................................................................................................... - smulwt r2, r6, r2 // .............................*............................................................................................................................................................................................................................................................................. - ldr.w r3, [r0, #224] // ..........*................................................................................................................................................................................................................................................................................................ - smulwt r1, r6, r1 // .............*............................................................................................................................................................................................................................................................................................. - movw r0, #24608 // ................*.......................................................................................................................................................................................................................................................................................... - smlabt r2, r2, r12, r0 // .................................*......................................................................................................................................................................................................................................................................... 
- // gap // ........................................................................................................................................................................................................................................................................................................... - smlabt r14, r14, r12, r0 // ...............................*........................................................................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smlabt r8, r8, r12, r0 // .....................*..................................................................................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smlabt r1, r1, r12, r0 // .......................*................................................................................................................................................................................................................................................................................... 
- pkhtb r14, r2, r14, asr #16 // ....................................*...................................................................................................................................................................................................................................................................... - smlabt r2, r10, r12, r0 // .........................*................................................................................................................................................................................................................................................................................. - pkhtb r8, r1, r8, asr #16 // ..........................*................................................................................................................................................................................................................................................................................ - smulwb r10, r6, r5 // .......*................................................................................................................................................................................................................................................................................................... - usub16 r1, r3, r14 // ......................................*.................................................................................................................................................................................................................................................................... - smulwt r6, r6, r5 // .........*................................................................................................................................................................................................................................................................................................. 
- uadd16 r14, r3, r14 // ........................................*.................................................................................................................................................................................................................................................................. - smlabt r3, r7, r12, r0 // ...................*....................................................................................................................................................................................................................................................................................... - vmov r5, s9 // ................................*.......................................................................................................................................................................................................................................................................... - smlabt r7, r6, r12, r0 // ..................*........................................................................................................................................................................................................................................................................................ - pkhtb r3, r3, r2, asr #16 // ..................................*........................................................................................................................................................................................................................................................................ - smlabt r6, r10, r12, r0 // .................*......................................................................................................................................................................................................................................................................................... 
- usub16 r10, r11, r3 // ..................................................*........................................................................................................................................................................................................................................................ - smulwb r2, r5, r14 // ..........................................*................................................................................................................................................................................................................................................................ - uadd16 r11, r11, r3 // ................................................*.......................................................................................................................................................................................................................................................... - smulwt r3, r5, r14 // ............................................*.............................................................................................................................................................................................................................................................. - pkhtb r7, r7, r6, asr #16 // ....................*...................................................................................................................................................................................................................................................................................... - smlabt r6, r2, r12, r0 // ..............................................*............................................................................................................................................................................................................................................................ 
- uadd16 r2, r9, r7 // ........................*.................................................................................................................................................................................................................................................................................. - smlabt r14, r3, r12, r0 // ...................................................*....................................................................................................................................................................................................................................................... - usub16 r7, r9, r7 // ......................*.................................................................................................................................................................................................................................................................................... - smulwt r3, r5, r2 // .....................................*..................................................................................................................................................................................................................................................................... - uadd16 r9, r4, r8 // ..............................*............................................................................................................................................................................................................................................................................ - smulwb r5, r5, r2 // ...................................*....................................................................................................................................................................................................................................................................... 
- usub16 r2, r4, r8 // ............................*.............................................................................................................................................................................................................................................................................. - smlabt r4, r3, r12, r0 // .........................................*................................................................................................................................................................................................................................................................. - vmov r3, s10 // .............................................*............................................................................................................................................................................................................................................................. - smlabt r8, r5, r12, r0 // .......................................*................................................................................................................................................................................................................................................................... - pkhtb r5, r14, r6, asr #16 // .....................................................*..................................................................................................................................................................................................................................................... - smulwb r6, r3, r7 // ........................................................*.................................................................................................................................................................................................................................................. 
- // gap // ........................................................................................................................................................................................................................................................................................................... - smulwt r14, r3, r7 // ..........................................................*................................................................................................................................................................................................................................................ - pkhtb r8, r4, r8, asr #16 // ...........................................*............................................................................................................................................................................................................................................................... - smulwb r7, r3, r1 // ...............................................*........................................................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smulwt r4, r3, r1 // .................................................*......................................................................................................................................................................................................................................................... 
- usub16 r1, r11, r5 // .......................................................*................................................................................................................................................................................................................................................... - smlabt r3, r7, r12, r0 // ....................................................*...................................................................................................................................................................................................................................................... - vmov r7, s12 // ...........................................................*............................................................................................................................................................................................................................................... - smlabt r4, r4, r12, r0 // ......................................................*.................................................................................................................................................................................................................................................... - uadd16 r11, r11, r5 // ...............................................................*........................................................................................................................................................................................................................................... - smulwb r5, r7, r1 // ..................................................................*........................................................................................................................................................................................................................................ 
- pkhtb r4, r4, r3, asr #16 // .........................................................*................................................................................................................................................................................................................................................. - smulwt r3, r7, r1 // ................................................................*.......................................................................................................................................................................................................................................... - usub16 r7, r10, r4 // .....................................................................*..................................................................................................................................................................................................................................... - smlabt r1, r5, r12, r0 // ......................................................................*.................................................................................................................................................................................................................................... - uadd16 r4, r10, r4 // .............................................................*............................................................................................................................................................................................................................................. - smlabt r3, r3, r12, r0 // ....................................................................*...................................................................................................................................................................................................................................... 
- vmov r5, s11 // .............................................................................*............................................................................................................................................................................................................................. - smlabt r6, r6, r12, r0 // ............................................................*.............................................................................................................................................................................................................................................. - pkhtb r10, r3, r1, asr #16 // ...............................................................................*........................................................................................................................................................................................................................... - smulwt r3, r5, r11 // ..................................................................................*........................................................................................................................................................................................................................ - uadd16 r1, r9, r8 // ...........................................................................*............................................................................................................................................................................................................................... - smulwb r11, r5, r11 // ................................................................................*.......................................................................................................................................................................................................................... 
- usub16 r5, r9, r8 // ...................................................................................*....................................................................................................................................................................................................................... - smlabt r8, r14, r12, r0 // ..............................................................*............................................................................................................................................................................................................................................ - vmov r9, s13 // ...................................................................*....................................................................................................................................................................................................................................... - smlabt r14, r11, r12, r0 // ....................................................................................*...................................................................................................................................................................................................................... - pkhtb r6, r8, r6, asr #16 // .................................................................*......................................................................................................................................................................................................................................... - smlabt r8, r3, r12, r0 // ......................................................................................*.................................................................................................................................................................................................................... 
- uadd16 r11, r2, r6 // .........................................................................*................................................................................................................................................................................................................................. - smulwb r3, r9, r4 // ........................................................................*.................................................................................................................................................................................................................................. - pkhtb r8, r8, r14, asr #16 // .........................................................................................*................................................................................................................................................................................................................. - smulwt r14, r9, r4 // ..........................................................................*................................................................................................................................................................................................................................ - usub16 r2, r2, r6 // .......................................................................*................................................................................................................................................................................................................................... - smlabt r6, r3, r12, r0 // ............................................................................*.............................................................................................................................................................................................................................. 
- vmov r3, s14 // .....................................................................................*..................................................................................................................................................................................................................... - smlabt r14, r14, r12, r0 // ..............................................................................*............................................................................................................................................................................................................................ - vmov r4, s20 // ..............................................................................................*............................................................................................................................................................................................................ - smulwb r9, r3, r7 // ........................................................................................*.................................................................................................................................................................................................................. - pkhtb r6, r14, r6, asr #16 // .................................................................................*......................................................................................................................................................................................................................... - smulwt r3, r3, r7 // ...........................................................................................*............................................................................................................................................................................................................... 
- uadd16 r7, r5, r10 // ............................................................................................................................*.............................................................................................................................................................................. - smlabt r9, r9, r12, r0 // .............................................................................................*............................................................................................................................................................................................................. - vmov r14, s17 // ..............................................................................................................................*............................................................................................................................................................................ - smlabt r3, r3, r12, r0 // ...............................................................................................*........................................................................................................................................................................................................... - usub16 r10, r5, r10 // ...........................................................................................................................................*............................................................................................................................................................... - smulwt r5, r14, r7 // ................................................................................................................................*.......................................................................................................................................................................... 
- pkhtb r3, r3, r9, asr #16 // .................................................................................................*......................................................................................................................................................................................................... - smulwb r14, r14, r7 // ....................................................................................................................................*...................................................................................................................................................................... - vmov r7, s19 // ..................................................................................................................*........................................................................................................................................................................................ - smlabt r5, r5, r12, r0 // ...................................................................................................................................*....................................................................................................................................................................... - usub16 r9, r11, r6 // .......................................................................................*................................................................................................................................................................................................................... - smlabt r14, r14, r12, r0 // ........................................................................................................................................*.................................................................................................................................................................. 
- uadd16 r6, r11, r6 // ............................................................................................*.............................................................................................................................................................................................................. - smulwt r11, r4, r9 // ................................................................................................*.......................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smulwb r4, r4, r9 // ..................................................................................................*........................................................................................................................................................................................................ - pkhtb r5, r5, r14, asr #16 // ...............................................................................................................................................*........................................................................................................................................................... - smulwt r9, r7, r6 // .........................................................................................................................*................................................................................................................................................................................. 
- uadd16 r14, r2, r3 // .....................................................................................................*..................................................................................................................................................................................................... - smlabt r4, r4, r12, r0 // .....................................................................................................................*..................................................................................................................................................................................... - usub16 r2, r2, r3 // ...................................................................................................*....................................................................................................................................................................................................... - smlabt r11, r11, r12, r0 // ....................................................................................................*...................................................................................................................................................................................................... - vmov r3, s18 // .............................................................................................................................................*............................................................................................................................................................. - smulwb r6, r7, r6 // .............................................................................................................................*............................................................................................................................................................................. 
- pkhtb r7, r11, r4, asr #16 // ..........................................................................................................................*................................................................................................................................................................................ - smulwt r4, r3, r10 // ................................................................................................................................................*.......................................................................................................................................................... - vmov r11, s21 // ..........................................................................................................*................................................................................................................................................................................................ - smulwb r3, r3, r10 // ......................................................................................................................................................*.................................................................................................................................................... - vmov s5, r7 // ..................................................................................................................................*........................................................................................................................................................................ - smlabt r7, r9, r12, r0 // ...............................................................................................................................*........................................................................................................................................................................... 
- vmov s2, r5 // .................................................................................................................................................*......................................................................................................................................................... - smlabt r5, r6, r12, r0 // .................................................................................................................................*......................................................................................................................................................................... - uadd16 r9, r1, r8 // ........................................................................................................*.................................................................................................................................................................................................. - smlabt r6, r3, r12, r0 // ..........................................................................................................................................................*................................................................................................................................................ - usub16 r3, r1, r8 // ............................................................................................................*.............................................................................................................................................................................................. - smulwb r1, r11, r14 // .............................................................................................................*............................................................................................................................................................................................. 
- pkhtb r8, r7, r5, asr #16 // .....................................................................................................................................*..................................................................................................................................................................... - smulwt r5, r11, r14 // ...............................................................................................................*........................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................................................................................................... - smlabt r10, r1, r12, r0 // .................................................................................................................*......................................................................................................................................................................................... - vmov r1, s15 // .......................................................................................................................................*................................................................................................................................................................... - smlabt r14, r5, r12, r0 // ...................................................................................................................*....................................................................................................................................................................................... 
- vmov s4, r8 // .................................................................................................................................................................*......................................................................................................................................... - smulwb r11, r1, r9 // ..........................................................................................................................................*................................................................................................................................................................ - vmov r5, s16 // ....................................................................................................................*...................................................................................................................................................................................... - smulwt r8, r1, r9 // ..................................................................................................................................................*........................................................................................................................................................ - vmov r1, s22 // ..........................................................................................*................................................................................................................................................................................................................ - smlabt r4, r4, r12, r0 // ....................................................................................................................................................*...................................................................................................................................................... 
- pkhtb r7, r14, r10, asr #16 // ......................................................................................................................*.................................................................................................................................................................................... - smlabt r8, r8, r12, r0 // ........................................................................................................................................................*.................................................................................................................................................. - pkhtb r4, r4, r6, asr #16 // ...................................................................................................................................................................*....................................................................................................................................... - smlabt r9, r11, r12, r0 // ..............................................................................................................................................*............................................................................................................................................................ - vmov r11, s23 // .........................................................................................................................................*................................................................................................................................................................. - smulwt r14, r1, r2 // .......................................................................................................*................................................................................................................................................................................................... 
- vmov s6, r7 // ........................................................................................................................*.................................................................................................................................................................................. - smulwb r1, r1, r2 // ......................................................................................................*.................................................................................................................................................................................................... - pkhtb r10, r8, r9, asr #16 // ...........................................................................................................................................................*............................................................................................................................................... - smlabt r9, r14, r12, r0 // ...........................................................................................................*............................................................................................................................................................................................... - vmov r8, s8 // .....................................................................................................................................................*..................................................................................................................................................... - smlabt r14, r1, r12, r0 // .........................................................................................................*................................................................................................................................................................................................. 
- ldr.w r7, [r11, #320] // .......................................................................................................................................................*................................................................................................................................................... - smulwt r6, r5, r3 // .......................................................................................................................*................................................................................................................................................................................... - ldr.w r2, [r11, #256] // .............................................................................................................................................................*............................................................................................................................................. - smulwb r1, r5, r3 // ......................................................................................................................................*.................................................................................................................................................................... - pkhtb r14, r9, r14, asr #16 // ..............................................................................................................*............................................................................................................................................................................................ - smlabt r5, r6, r12, r0 // ...........................................................................................................................*............................................................................................................................................................................... 
- vmov s7, r14 // ................................................................................................................*.......................................................................................................................................................................................... - smulwb r14, r8, r2 // ..................................................................................................................................................................*........................................................................................................................................ - ldr.w r3, [r11, #0] // .........................................................................................................................................................*................................................................................................................................................. - smlabt r1, r1, r12, r0 // ............................................................................................................................................*.............................................................................................................................................................. - vmov s3, r4 // .........................................................................................................................................................................*................................................................................................................................. - smulwb r4, r8, r7 // ............................................................................................................................................................*.............................................................................................................................................. 
- vmov s0, r10 // ...................................................................................................................................................................................*....................................................................................................................... - smulwt r6, r8, r2 // ....................................................................................................................................................................*...................................................................................................................................... - pkhtb r5, r5, r1, asr #16 // ...................................................................................................................................................*....................................................................................................................................................... - smulwt r1, r8, r7 // ..............................................................................................................................................................*............................................................................................................................................ - movw r0, #24608 // ...............................................................................................................................................................*........................................................................................................................................... - smlabt r7, r14, r12, r0 // ......................................................................................................................................................................*.................................................................................................................................... 
- ldr.w r9, [r11, #384] // .....................................................................................................................................................................*..................................................................................................................................... - smlabt r6, r6, r12, r0 // ........................................................................................................................................................................*.................................................................................................................................. - ldr.w r10, [r11, #448] // ....................................................................................................................................................................................*...................................................................................................................... - smlabt r14, r4, r12, r0 // ................................................................................................................................................................*.......................................................................................................................................... - pkhtb r7, r6, r7, asr #16 // ...........................................................................................................................................................................*............................................................................................................................... - smulwb r4, r8, r9 // ..........................................................................................................................................................................*................................................................................................................................ 
- usub16 r6, r3, r7 // .............................................................................................................................................................................*............................................................................................................................. - smulwb r2, r8, r10 // ........................................................................................................................................................................................*.................................................................................................................. - vmov s1, r5 // ......................................................................................................................................................................................*.................................................................................................................... - smlabt r1, r1, r12, r0 // ..............................................................................................................................................................................*............................................................................................................................ - uadd16 r3, r3, r7 // ...............................................................................................................................................................................*........................................................................................................................... - smulwt r7, r8, r9 // ............................................................................................................................................................................*.............................................................................................................................. 
- pkhtb r9, r1, r14, asr #16 // .................................................................................................................................................................................*......................................................................................................................... - smulwt r1, r8, r10 // ..........................................................................................................................................................................................*................................................................................................................ - ldr.w r8, [r11, #64] // .......................................................................................................................................................................*................................................................................................................................... - smlabt r5, r4, r12, r0 // ..................................................................................................................................................................................*........................................................................................................................ - ldr.w r4, [r11, #128] // .....................................................................................................................................................................................*..................................................................................................................... - smlabt r7, r7, r12, r0 // ................................................................................................................................................................................*.......................................................................................................................... 
- ldr.w r10, [r11, #192] // ...........................................................................................................................................................................................*............................................................................................................... - smlabt r14, r2, r12, r0 // ............................................................................................................................................................................................*.............................................................................................................. - pkhtb r11, r7, r5, asr #16 // .......................................................................................................................................................................................*................................................................................................................... - vmov r2, s9 // .............................................................................................................................................................................................*............................................................................................................. - uadd16 r7, r4, r11 // .........................................................................................................................................................................................*................................................................................................................. - smlabt r1, r1, r12, r0 // ..............................................................................................................................................................................................*............................................................................................................ 
- usub16 r5, r4, r11 // ...............................................................................................................................................................................................*........................................................................................................... - smulwb r11, r2, r7 // ................................................................................................................................................................................................*.......................................................................................................... - pkhtb r14, r1, r14, asr #16 // .................................................................................................................................................................................................*......................................................................................................... - smulwt r4, r2, r7 // ..................................................................................................................................................................................................*........................................................................................................ - usub16 r7, r10, r14 // ...................................................................................................................................................................................................*....................................................................................................... - smlabt r1, r11, r12, r0 // ....................................................................................................................................................................................................*...................................................................................................... 
- uadd16 r10, r10, r14 // .....................................................................................................................................................................................................*..................................................................................................... - smlabt r4, r4, r12, r0 // ......................................................................................................................................................................................................*.................................................................................................... - vmov r11, s10 // .......................................................................................................................................................................................................*................................................................................................... - smulwb r14, r2, r10 // ........................................................................................................................................................................................................*.................................................................................................. - pkhtb r1, r4, r1, asr #16 // .........................................................................................................................................................................................................*................................................................................................. - smulwt r10, r2, r10 // ..........................................................................................................................................................................................................*................................................................................................ 
- usub16 r4, r3, r1 // ...........................................................................................................................................................................................................*............................................................................................... - smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................*.............................................................................................. - uadd16 r1, r3, r1 // .............................................................................................................................................................................................................*............................................................................................. - smlabt r3, r10, r12, r0 // ..............................................................................................................................................................................................................*............................................................................................ - usub16 r10, r8, r9 // .................................................................................................................................................................................................................*......................................................................................... - smulwt r2, r11, r7 // ..................................................................................................................................................................................................................*........................................................................................ 
- uadd16 r8, r8, r9 // ...............................................................................................................................................................................................................*........................................................................................... - smulwb r9, r11, r5 // ................................................................................................................................................................................................................*.......................................................................................... - pkhtb r3, r3, r14, asr #16 // ...................................................................................................................................................................................................................*....................................................................................... - smulwt r14, r11, r5 // ....................................................................................................................................................................................................................*...................................................................................... - usub16 r5, r8, r3 // .....................................................................................................................................................................................................................*..................................................................................... - smlabt r9, r9, r12, r0 // ......................................................................................................................................................................................................................*.................................................................................... 
- uadd16 r3, r8, r3 // .......................................................................................................................................................................................................................*................................................................................... - smlabt r8, r14, r12, r0 // ........................................................................................................................................................................................................................*.................................................................................. - vmov r14, s12 // .........................................................................................................................................................................................................................*................................................................................. - smulwb r7, r11, r7 // ..........................................................................................................................................................................................................................*................................................................................ - pkhtb r9, r8, r9, asr #16 // ...........................................................................................................................................................................................................................*............................................................................... - smulwb r11, r14, r5 // ............................................................................................................................................................................................................................*.............................................................................. 
- usub16 r8, r6, r9 // .............................................................................................................................................................................................................................*............................................................................. - smlabt r7, r7, r12, r0 // ..............................................................................................................................................................................................................................*............................................................................ - uadd16 r9, r6, r9 // ...............................................................................................................................................................................................................................*........................................................................... - smlabt r6, r2, r12, r0 // .................................................................................................................................................................................................................................*......................................................................... - vmov r2, s13 // ................................................................................................................................................................................................................................*.......................................................................... - smulwt r14, r14, r5 // ..................................................................................................................................................................................................................................*........................................................................ 
- pkhtb r7, r6, r7, asr #16 // ...................................................................................................................................................................................................................................*....................................................................... - smlabt r5, r11, r12, r0 // ....................................................................................................................................................................................................................................*...................................................................... - uadd16 r6, r10, r7 // .....................................................................................................................................................................................................................................*..................................................................... - smlabt r14, r14, r12, r0 // ......................................................................................................................................................................................................................................*.................................................................... - usub16 r11, r10, r7 // .......................................................................................................................................................................................................................................*................................................................... - smulwb r7, r2, r6 // ........................................................................................................................................................................................................................................*.................................................................. 
- pkhtb r5, r14, r5, asr #16 // ...........................................................................................................................................................................................................................................*............................................................... - smulwt r6, r2, r6 // ..........................................................................................................................................................................................................................................*................................................................ - uadd16 r2, r4, r5 // .............................................................................................................................................................................................................................................*............................................................. - smlabt r10, r7, r12, r0 // ............................................................................................................................................................................................................................................*.............................................................. - vmov r7, s11 // .........................................................................................................................................................................................................................................*................................................................. - smlabt r14, r6, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................ 
- usub16 r5, r4, r5 // ...............................................................................................................................................................................................................................................*........................................................... - smulwb r4, r7, r3 // ................................................................................................................................................................................................................................................*.......................................................... - pkhtb r10, r14, r10, asr #16 // .................................................................................................................................................................................................................................................*......................................................... - smulwt r14, r7, r3 // ..................................................................................................................................................................................................................................................*........................................................ - usub16 r6, r9, r10 // ...................................................................................................................................................................................................................................................*....................................................... - smlabt r4, r4, r12, r0 // ....................................................................................................................................................................................................................................................*...................................................... 
- vmov r7, s14 // .....................................................................................................................................................................................................................................................*..................................................... - smlabt r14, r14, r12, r0 // ......................................................................................................................................................................................................................................................*.................................................... - uadd16 r9, r9, r10 // .......................................................................................................................................................................................................................................................*................................................... - smulwt r10, r7, r11 // ..........................................................................................................................................................................................................................................................*................................................ - pkhtb r3, r14, r4, asr #16 // .........................................................................................................................................................................................................................................................*................................................. - smulwb r7, r7, r11 // ........................................................................................................................................................................................................................................................*.................................................. 
- usub16 r14, r1, r3 // .............................................................................................................................................................................................................................................................*............................................. - smlabt r4, r10, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................ - vmov r11, s1 // ...........................................................................................................................................................................................................................................................*............................................... - smlabt r7, r7, r12, r0 // ............................................................................................................................................................................................................................................................*.............................................. - usub16 r10, r14, r11 // ...............................................................................................................................................................................................................................................................*........................................... - vmov r0, s23 // ................................................................................................................................................................................................................................................................*.......................................... 
- str.w r10, [r0, #96] // ..................................................................................................................................................................................................................................................................*........................................ - vmov r10, s2 // ....................................................................................................................................................................................................................................................................*...................................... - pkhtb r4, r4, r7, asr #16 // .......................................................................................................................................................................................................................................................................*................................... - vmov r7, s5 // ........................................................................................................................................................................................................................................................................*.................................. - uadd16 r14, r14, r11 // .................................................................................................................................................................................................................................................................*......................................... - str.w r14, [r0, #64] // ...................................................................................................................................................................................................................................................................*....................................... 
- usub16 r14, r2, r10 // .....................................................................................................................................................................................................................................................................*..................................... - str.w r14, [r0, #160] // ......................................................................................................................................................................................................................................................................*.................................... - uadd16 r14, r6, r7 // .........................................................................................................................................................................................................................................................................*................................. - str.w r14, [r0, #320] // ..........................................................................................................................................................................................................................................................................*................................ - usub16 r6, r6, r7 // ...........................................................................................................................................................................................................................................................................*............................... - vmov r14, s7 // ..............................................................................................................................................................................................................................................................................*............................ 
- usub16 r11, r8, r4 // .............................................................................................................................................................................................................................................................................*............................. - str.w r6, [r0, #352] // ............................................................................................................................................................................................................................................................................*.............................. - usub16 r6, r11, r14 // ...............................................................................................................................................................................................................................................................................*........................... - str.w r6, [r0, #480] // ..................................................................................................................................................................................................................................................................................*........................ - uadd16 r11, r11, r14 // .................................................................................................................................................................................................................................................................................*......................... - vmov r6, s3 // ....................................................................................................................................................................................................................................................................................*...................... 
- uadd16 r14, r1, r3 // ...................................................................................................................................................................................................................................................................................*....................... - vmov r3, s0 // ................................................................................................................................................................................................................................................................................*.......................... - usub16 r1, r5, r6 // ...........................................................................................................................................................................................................................................................................................*............... - str.w r1, [r0, #224] // ............................................................................................................................................................................................................................................................................................*.............. - uadd16 r7, r2, r10 // .....................................................................................................................................................................................................................................................................................*..................... - str.w r7, [r0, #128] // ......................................................................................................................................................................................................................................................................................*.................... 
- uadd16 r1, r5, r6 // .......................................................................................................................................................................................................................................................................................*................... - str.w r1, [r0, #192] // ........................................................................................................................................................................................................................................................................................*.................. - usub16 r1, r14, r3 // .........................................................................................................................................................................................................................................................................................*................. - str.w r1, [r0, #32] // ..........................................................................................................................................................................................................................................................................................*................ - uadd16 r14, r14, r3 // .............................................................................................................................................................................................................................................................................................*............. - str.w r14, [r0], #4 // ................................................................................................................................................................................................................................................................................................*.......... 
// @slothy:core - uadd16 r6, r8, r4 // ...............................................................................................................................................................................................................................................................................................*........... - vmov r5, s6 // ..............................................................................................................................................................................................................................................................................................*............ - usub16 r1, r6, r5 // .................................................................................................................................................................................................................................................................................................*......... - str.w r1, [r0, #412] // ..................................................................................................................................................................................................................................................................................................*........ - uadd16 r10, r6, r5 // ...................................................................................................................................................................................................................................................................................................*....... - str.w r11, [r0, #444] // .....................................................................................................................................................................................................................................................................................................*..... 
- str.w r10, [r0, #380] // ....................................................................................................................................................................................................................................................................................................*...... - vmov r7, s4 // ......................................................................................................................................................................................................................................................................................................*.... - usub16 r5, r9, r7 // .......................................................................................................................................................................................................................................................................................................*... - str.w r5, [r0, #284] // ........................................................................................................................................................................................................................................................................................................*.. - uadd16 r6, r9, r7 // .........................................................................................................................................................................................................................................................................................................*. 
- str.w r6, [r0, #252] // ..........................................................................................................................................................................................................................................................................................................* + // --------------------------------------------------------------------------------------------------------------------------------------------- original position ---------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + vmov r14, s8 // ...*.......................................................................................................................................................................................................................................................................................................... + ldr.w r11, [r0, #416] // ......*....................................................................................................................................................................................................................................................................................................... + vmov s23, r0 // ..*........................................................................................................................................................................................................................................................................................................... 
+ ldr.w r5, [r0, #352] // ....*......................................................................................................................................................................................................................................................................................................... + ldr.w r9, [r0, #96] // .*............................................................................................................................................................................................................................................................................................................ + // gap // .............................................................................................................................................................................................................................................................................................................. + ldr.w r2, [r0, #288] // ........*..................................................................................................................................................................................................................................................................................................... + smulwb r7, r14, r5 // .............*................................................................................................................................................................................................................................................................................................ + ldr.w r10, [r0, #32] // ..........*................................................................................................................................................................................................................................................................................................... 
+ smulwt r6, r14, r5 // .......*...................................................................................................................................................................................................................................................................................................... + ldr.w r5, [r0, #480] // .....*........................................................................................................................................................................................................................................................................................................ + smulwb r4, r14, r2 // ....................*......................................................................................................................................................................................................................................................................................... + ldr.w r8, [r0, #160] // ............*................................................................................................................................................................................................................................................................................................. + smulwt r3, r14, r2 // ......................*....................................................................................................................................................................................................................................................................................... + ldr.w r2, [r0, #224] // *............................................................................................................................................................................................................................................................................................................. 
+ movw r0, #24608 // ..............*............................................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smlabt r4, r4, r12, r0 // ........................*..................................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smlabt r3, r3, r12, r0 // ..........................*................................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. 
+ smlabt r6, r6, r12, r0 // ...................*.......................................................................................................................................................................................................................................................................................... + pkhtb r4, r3, r4, asr #16 // ............................*................................................................................................................................................................................................................................................................................. + smlabt r7, r7, r12, r0 // .................*............................................................................................................................................................................................................................................................................................ + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r3, r14, r5 // .........*.................................................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. 
+ smulwt r5, r14, r5 // ...........*.................................................................................................................................................................................................................................................................................................. + pkhtb r7, r6, r7, asr #16 // .....................*........................................................................................................................................................................................................................................................................................ + smlabt r3, r3, r12, r0 // ...............*.............................................................................................................................................................................................................................................................................................. + usub16 r6, r10, r4 // ..............................*............................................................................................................................................................................................................................................................................... + smlabt r5, r5, r12, r0 // ................*............................................................................................................................................................................................................................................................................................. + uadd16 r4, r10, r4 // ................................*............................................................................................................................................................................................................................................................................. 
+ smulwb r10, r14, r11 // ...........................*.................................................................................................................................................................................................................................................................................. + pkhtb r3, r5, r3, asr #16 // ..................*........................................................................................................................................................................................................................................................................................... + smulwt r14, r14, r11 // .............................*................................................................................................................................................................................................................................................................................ + usub16 r5, r2, r3 // ......................................*....................................................................................................................................................................................................................................................................... + vmov r11, s9 // ...................................*.......................................................................................................................................................................................................................................................................... + uadd16 r2, r2, r3 // ..................................*........................................................................................................................................................................................................................................................................... 
+ smlabt r3, r10, r12, r0 // ...............................*.............................................................................................................................................................................................................................................................................. + uadd16 r10, r9, r7 // .........................*.................................................................................................................................................................................................................................................................................... + smlabt r14, r14, r12, r0 // .................................*............................................................................................................................................................................................................................................................................ + usub16 r7, r9, r7 // .......................*...................................................................................................................................................................................................................................................................................... + smulwb r9, r11, r2 // .....................................*........................................................................................................................................................................................................................................................................ + pkhtb r3, r14, r3, asr #16 // ....................................*......................................................................................................................................................................................................................................................................... 
+ smulwt r2, r11, r2 // .......................................*...................................................................................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smlabt r14, r9, r12, r0 // .........................................*.................................................................................................................................................................................................................................................................... + uadd16 r9, r8, r3 // ..........................................*................................................................................................................................................................................................................................................................... + smlabt r2, r2, r12, r0 // ...........................................*.................................................................................................................................................................................................................................................................. + usub16 r8, r8, r3 // ........................................*..................................................................................................................................................................................................................................................................... 
+ smulwb r3, r11, r9 // ..............................................*............................................................................................................................................................................................................................................................... + pkhtb r2, r2, r14, asr #16 // .............................................*................................................................................................................................................................................................................................................................ + smulwt r9, r11, r9 // ............................................*................................................................................................................................................................................................................................................................. + uadd16 r11, r10, r2 // ...............................................*.............................................................................................................................................................................................................................................................. + smlabt r3, r3, r12, r0 // ..................................................*........................................................................................................................................................................................................................................................... + vmov r14, s11 // ................................................................*............................................................................................................................................................................................................................................. 
+ smlabt r9, r9, r12, r0 // ................................................*............................................................................................................................................................................................................................................................. + usub16 r10, r10, r2 // ...................................................*.......................................................................................................................................................................................................................................................... + smulwb r2, r14, r11 // ...................................................................*.......................................................................................................................................................................................................................................... + pkhtb r9, r9, r3, asr #16 // ......................................................*....................................................................................................................................................................................................................................................... + smulwt r3, r14, r11 // .....................................................................*........................................................................................................................................................................................................................................ + uadd16 r14, r4, r9 // ........................................................*..................................................................................................................................................................................................................................................... 
+ smlabt r11, r2, r12, r0 // .......................................................................*...................................................................................................................................................................................................................................... + vmov r2, s12 // ......................................................................*....................................................................................................................................................................................................................................... + smlabt r3, r3, r12, r0 // .........................................................................*.................................................................................................................................................................................................................................... + usub16 r4, r4, r9 // ..........................................................*................................................................................................................................................................................................................................................... + smulwt r9, r2, r10 // .............................................................................*................................................................................................................................................................................................................................ + pkhtb r11, r3, r11, asr #16 // ............................................................................*................................................................................................................................................................................................................................. 
+ smulwb r2, r2, r10 // ...........................................................................*.................................................................................................................................................................................................................................. + usub16 r3, r14, r11 // ..................................................................................*........................................................................................................................................................................................................................... + smlabt r10, r9, r12, r0 // .................................................................................*............................................................................................................................................................................................................................ + vmov r9, s16 // .............................................................................................*................................................................................................................................................................................................................ + smlabt r2, r2, r12, r0 // ...............................................................................*.............................................................................................................................................................................................................................. + uadd16 r11, r14, r11 // ................................................................................*............................................................................................................................................................................................................................. 
+ smulwb r14, r9, r3 // ................................................................................................*............................................................................................................................................................................................................. + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwt r9, r9, r3 // ...............................................................................................*.............................................................................................................................................................................................................. + pkhtb r3, r10, r2, asr #16 // ....................................................................................*......................................................................................................................................................................................................................... + smlabt r14, r14, r12, r0 // ....................................................................................................*......................................................................................................................................................................................................... + vmov r2, s10 // .................................................*............................................................................................................................................................................................................................................................ 
+ smlabt r9, r9, r12, r0 // ..................................................................................................*........................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r10, r2, r5 // ...........................................................*.................................................................................................................................................................................................................................................. + pkhtb r14, r9, r14, asr #16 // ........................................................................................................*..................................................................................................................................................................................................... + smulwt r9, r2, r5 // .............................................................*................................................................................................................................................................................................................................................ + vmov s1, r14 // ..........................................................................................................*................................................................................................................................................................................................... 
+ smlabt r10, r10, r12, r0 // ...............................................................*.............................................................................................................................................................................................................................................. + usub16 r5, r4, r3 // ........................................................................................*..................................................................................................................................................................................................................... + smlabt r14, r9, r12, r0 // .................................................................*............................................................................................................................................................................................................................................ + uadd16 r4, r4, r3 // ..........................................................................................*................................................................................................................................................................................................................... + smulwb r3, r2, r8 // ....................................................*......................................................................................................................................................................................................................................................... + pkhtb r10, r14, r10, asr #16 // ....................................................................*......................................................................................................................................................................................................................................... 
+ smulwt r8, r2, r8 // .....................................................*........................................................................................................................................................................................................................................................ + usub16 r9, r7, r10 // ..........................................................................*................................................................................................................................................................................................................................... + smlabt r2, r3, r12, r0 // .......................................................*...................................................................................................................................................................................................................................................... + vmov r14, s14 // ..............................................................................*............................................................................................................................................................................................................................... + smlabt r3, r8, r12, r0 // .........................................................*.................................................................................................................................................................................................................................................... + uadd16 r10, r7, r10 // ........................................................................*..................................................................................................................................................................................................................................... 
+ smulwb r8, r14, r9 // ...................................................................................*.......................................................................................................................................................................................................................... + vmov r7, s18 // ............................................................................................................*................................................................................................................................................................................................. + smulwt r14, r14, r9 // .....................................................................................*........................................................................................................................................................................................................................ + pkhtb r9, r3, r2, asr #16 // ............................................................*................................................................................................................................................................................................................................................. + smlabt r8, r8, r12, r0 // .......................................................................................*...................................................................................................................................................................................................................... + usub16 r3, r6, r9 // ..............................................................*............................................................................................................................................................................................................................................... 
+ smlabt r2, r14, r12, r0 // .........................................................................................*.................................................................................................................................................................................................................... + uadd16 r6, r6, r9 // ..................................................................*........................................................................................................................................................................................................................................... + smulwb r14, r7, r5 // ...............................................................................................................*.............................................................................................................................................................................................. + pkhtb r9, r2, r8, asr #16 // ...........................................................................................*.................................................................................................................................................................................................................. + smulwt r8, r7, r5 // .................................................................................................................*............................................................................................................................................................................................ + uadd16 r7, r3, r9 // .................................................................................................*............................................................................................................................................................................................................ 
+ smlabt r5, r14, r12, r0 // .....................................................................................................................*........................................................................................................................................................................................ + vmov r14, s21 // ....................................................................................................................*......................................................................................................................................................................................... + smlabt r8, r8, r12, r0 // ......................................................................................................................................*....................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r2, r14, r7 // ......................................................................................................................*....................................................................................................................................................................................... + pkhtb r8, r8, r5, asr #16 // ........................................................................................................................................*..................................................................................................................................................................... 
+ smulwt r7, r14, r7 // .......................................................................................................................*...................................................................................................................................................................................... + vmov r14, s17 // .......................................................................................................................................*...................................................................................................................................................................... + smlabt r5, r2, r12, r0 // .........................................................................................................................*.................................................................................................................................................................................... + vmov s3, r8 // ..........................................................................................................................................*................................................................................................................................................................... + smulwb r8, r14, r4 // ...........................................................................................................................................*.................................................................................................................................................................. + // gap // .............................................................................................................................................................................................................................................................................................................. 
+ smulwt r2, r14, r4 // ...............................................................................................................................................*.............................................................................................................................................................. + usub16 r4, r3, r9 // ...................................................................................................*.......................................................................................................................................................................................................... + smlabt r14, r7, r12, r0 // ...........................................................................................................................*.................................................................................................................................................................................. + vmov r7, s13 // ......................................................................................*....................................................................................................................................................................................................................... + smlabt r9, r8, r12, r0 // .................................................................................................................................................*............................................................................................................................................................ + pkhtb r5, r14, r5, asr #16 // ..............................................................................................................................*............................................................................................................................................................................... 
+ smulwb r3, r7, r10 // ............................................................................................*................................................................................................................................................................................................................. + vmov s6, r5 // ..................................................................................................................................................*........................................................................................................................................................... + smulwt r7, r7, r10 // ..............................................................................................*............................................................................................................................................................................................................... + vmov r10, s23 // .........................................................................................................................................*.................................................................................................................................................................... + smlabt r3, r3, r12, r0 // .......................................................................................................*...................................................................................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. 
+ smlabt r8, r7, r12, r0 // .....................................................................................................*........................................................................................................................................................................................................ + vmov r7, s22 // ......................................................................................................*....................................................................................................................................................................................................... + smlabt r5, r2, r12, r0 // ...................................................................................................................................................*.......................................................................................................................................................... + pkhtb r2, r8, r3, asr #16 // ..............................................................................................................*............................................................................................................................................................................................... + smulwb r8, r7, r4 // .........................................................................................................*.................................................................................................................................................................................................... + pkhtb r9, r5, r9, asr #16 // ......................................................................................................................................................*....................................................................................................................................................... 
+ smulwt r7, r7, r4 // ...........................................................................................................*.................................................................................................................................................................................................. + usub16 r3, r6, r2 // ................................................................................................................*............................................................................................................................................................................................. + smlabt r8, r8, r12, r0 // .............................................................................................................*................................................................................................................................................................................................ + vmov r4, s15 // ..........................................................................................................................*................................................................................................................................................................................... + smlabt r7, r7, r12, r0 // ...................................................................................................................*.......................................................................................................................................................................................... + uadd16 r2, r6, r2 // ..................................................................................................................*........................................................................................................................................................................................... 
+ smulwt r14, r4, r11 // .............................................................................................................................*................................................................................................................................................................................ + vmov r5, s19 // ................................................................................................................................*............................................................................................................................................................................. + smulwb r6, r4, r11 // ............................................................................................................................*................................................................................................................................................................................. + pkhtb r7, r7, r8, asr #16 // ........................................................................................................................*..................................................................................................................................................................................... + smulwb r8, r5, r2 // ..................................................................................................................................*........................................................................................................................................................................... + vmov r4, s20 // ........................................................................................................................................................*..................................................................................................................................................... 
+ smulwt r11, r5, r2 // ....................................................................................................................................*......................................................................................................................................................................... + vmov s2, r9 // ............................................................................................................................................................*................................................................................................................................................. + smlabt r9, r8, r12, r0 // .............................................................................................................................................*................................................................................................................................................................ + vmov s7, r7 // ............................................................................................................................................*................................................................................................................................................................. + smlabt r11, r11, r12, r0 // .....................................................................................................................................................*........................................................................................................................................................ + vmov r5, s8 // ....................................................................................................................................................*......................................................................................................................................................... 
+ smulwt r8, r4, r3 // .............................................................................................................................................................*................................................................................................................................................ + pkhtb r9, r11, r9, asr #16 // ..........................................................................................................................................................*................................................................................................................................................... + smlabt r11, r6, r12, r0 // ...............................................................................................................................*.............................................................................................................................................................................. + ldr.w r6, [r10, #256] // ..............................................................................................................................................................*............................................................................................................................................... + smlabt r14, r14, r12, r0 // .................................................................................................................................*............................................................................................................................................................................ + vmov s4, r9 // .......................................................................................................................................................................................*...................................................................................................................... 
+ smulwb r7, r4, r3 // ...........................................................................................................................................................*.................................................................................................................................................. + pkhtb r11, r14, r11, asr #16 // ...................................................................................................................................*.......................................................................................................................................................................... + smulwt r2, r5, r6 // ...........................................................................................................................................................................*.................................................................................................................................. + ldr.w r14, [r10, #448] // ................................................................................................................................................................................*............................................................................................................................. + smlabt r7, r7, r12, r0 // ...............................................................................................................................................................*.............................................................................................................................................. + vmov s0, r11 // .....................................................................................................................................*........................................................................................................................................................................ 
+ smlabt r11, r8, r12, r0 // .................................................................................................................................................................*............................................................................................................................................ + ldr.w r8, [r10, #320] // ................................................................................................................................................*............................................................................................................................................................. + smulwt r3, r5, r14 // ......................................................................................................................................................................................*....................................................................................................................... + pkhtb r7, r11, r7, asr #16 // ......................................................................................................................................................................*....................................................................................................................................... + smulwb r6, r5, r6 // .........................................................................................................................................................................*.................................................................................................................................... + vmov s5, r7 // ........................................................................................................................................................................*..................................................................................................................................... 
+ smulwb r7, r5, r8 // .......................................................................................................................................................*...................................................................................................................................................... + movw r0, #24608 // ..................................................................................................................................................................*........................................................................................................................................... + smlabt r6, r6, r12, r0 // .............................................................................................................................................................................*................................................................................................................................ + ldr.w r9, [r10, #0] // ....................................................................................................................................................................*......................................................................................................................................... + smlabt r2, r2, r12, r0 // ...............................................................................................................................................................................*.............................................................................................................................. + ldr.w r4, [r10, #384] // ................................................................................................................................................................*............................................................................................................................................. 
+ smulwt r8, r5, r8 // .........................................................................................................................................................*.................................................................................................................................................... + pkhtb r11, r2, r6, asr #16 // ..................................................................................................................................................................................*........................................................................................................................... + smlabt r7, r7, r12, r0 // .....................................................................................................................................................................*........................................................................................................................................ + uadd16 r2, r9, r11 // ....................................................................................................................................................................................*......................................................................................................................... + smlabt r8, r8, r12, r0 // ...................................................................................................................................................................*.......................................................................................................................................... + usub16 r6, r9, r11 // .....................................................................................................................................................................................*........................................................................................................................ 
+ smulwb r9, r5, r4 // .......................................................................................................................................................................*...................................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r11, r5, r14 // ........................................................................................................................................................................................*..................................................................................................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwt r4, r5, r4 // .................................................................................................................................................................................*............................................................................................................................ + pkhtb r7, r8, r7, asr #16 // ..........................................................................................................................................................................*................................................................................................................................... 
+ smlabt r9, r9, r12, r0 // ...................................................................................................................................................................................*.......................................................................................................................... + ldr.w r5, [r10, #128] // .........................................................................................................................................................................................*.................................................................................................................... + smlabt r8, r4, r12, r0 // ..........................................................................................................................................................................................*................................................................................................................... + ldr.w r14, [r10, #192] // ...........................................................................................................................................................................................*.................................................................................................................. + smlabt r4, r11, r12, r0 // ............................................................................................................................................................................................*................................................................................................................. + pkhtb r11, r8, r9, asr #16 // .............................................................................................................................................................................................*................................................................................................................ 
+ smlabt r8, r3, r12, r0 // ..............................................................................................................................................................................................*............................................................................................................... + usub16 r9, r5, r11 // ...................................................................................................................................................................................................*.......................................................................................................... + vmov r3, s10 // ..............................................................................................................................................................................................................*............................................................................................... + pkhtb r8, r8, r4, asr #16 // .................................................................................................................................................................................................*............................................................................................................ + smulwb r4, r3, r9 // .................................................................................................................................................................................................................*............................................................................................ + uadd16 r11, r5, r11 // ...............................................................................................................................................................................................*.............................................................................................................. 
+ smulwt r5, r3, r9 // ...................................................................................................................................................................................................................*.......................................................................................... + ldr.w r9, [r10, #64] // ..............................................................................................................................................*............................................................................................................................................................... + smlabt r4, r4, r12, r0 // .....................................................................................................................................................................................................................*........................................................................................ + usub16 r10, r14, r8 // .....................................................................................................................................................................................................*........................................................................................................ + smlabt r5, r5, r12, r0 // .......................................................................................................................................................................................................................*...................................................................................... + uadd16 r8, r14, r8 // .......................................................................................................................................................................................................*...................................................................................................... 
+ smulwb r14, r3, r10 // ........................................................................................................................................................................................................................*..................................................................................... + pkhtb r4, r5, r4, asr #16 // .........................................................................................................................................................................................................................*.................................................................................... + smulwt r3, r3, r10 // ..........................................................................................................................................................................................................................*................................................................................... + uadd16 r10, r9, r7 // ............................................................................................................................................................................*................................................................................................................................. + smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................................*................................................................................. + vmov r5, s9 // ................................................................................................................................................................................................*............................................................................................................. 
+ smlabt r3, r3, r12, r0 // ..............................................................................................................................................................................................................................*............................................................................... + usub16 r9, r9, r7 // ..............................................................................................................................................................................*............................................................................................................................... + smulwb r7, r5, r8 // .........................................................................................................................................................................................................*.................................................................................................... + pkhtb r3, r3, r14, asr #16 // .................................................................................................................................................................................................................................*............................................................................ + smulwt r14, r5, r8 // ...........................................................................................................................................................................................................*.................................................................................................. + usub16 r8, r6, r4 // ...........................................................................................................................................................................................................................*.................................................................................. 
+ smlabt r7, r7, r12, r0 // .............................................................................................................................................................................................................*................................................................................................ + uadd16 r6, r6, r4 // ...............................................................................................................................................................................................................................*.............................................................................. + smlabt r14, r14, r12, r0 // ...............................................................................................................................................................................................................*.............................................................................................. + // gap // .............................................................................................................................................................................................................................................................................................................. + smulwb r4, r5, r11 // ..................................................................................................................................................................................................*........................................................................................................... + pkhtb r14, r14, r7, asr #16 // ..................................................................................................................................................................................................................*........................................................................................... 
+ smulwt r7, r5, r11 // ....................................................................................................................................................................................................*......................................................................................................... + usub16 r11, r10, r14 // ....................................................................................................................................................................................................................*......................................................................................... + smlabt r4, r4, r12, r0 // ......................................................................................................................................................................................................*....................................................................................................... + vmov r5, s12 // .............................................................................................................................................................................................................................*................................................................................ + smlabt r7, r7, r12, r0 // ........................................................................................................................................................................................................*..................................................................................................... + uadd16 r10, r10, r14 // ......................................................................................................................................................................................................................*....................................................................................... 
+ smulwt r14, r5, r11 // ...................................................................................................................................................................................................................................*.......................................................................... + pkhtb r7, r7, r4, asr #16 // ..........................................................................................................................................................................................................*................................................................................................... + smulwb r4, r5, r11 // ................................................................................................................................................................................................................................*............................................................................. + usub16 r11, r2, r7 // ............................................................................................................................................................................................................*................................................................................................. + smlabt r5, r14, r12, r0 // .......................................................................................................................................................................................................................................*...................................................................... + uadd16 r2, r2, r7 // ................................................................................................................................................................................................................*............................................................................................. 
+ smlabt r4, r4, r12, r0 // .....................................................................................................................................................................................................................................*........................................................................ + usub16 r7, r9, r3 // ..................................................................................................................................................................................................................................*........................................................................... + vmov r14, s14 // ......................................................................................................................................................................................................................................*....................................................................... + uadd16 r9, r9, r3 // ....................................................................................................................................................................................................................................*......................................................................... + smulwb r3, r14, r7 // ........................................................................................................................................................................................................................................*..................................................................... + pkhtb r5, r5, r4, asr #16 // .........................................................................................................................................................................................................................................*.................................................................... 
+ smulwt r4, r14, r7 // ..........................................................................................................................................................................................................................................*................................................................... + uadd16 r14, r11, r5 // ...............................................................................................................................................................................................................................................*.............................................................. + smlabt r7, r3, r12, r0 // ............................................................................................................................................................................................................................................*................................................................. + vmov r3, s11 // .............................................................................................................................................................................................................................................*................................................................ + smlabt r4, r4, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................... + usub16 r5, r11, r5 // ...........................................................................................................................................................................................................................................*.................................................................. 
+ smulwb r11, r3, r10 // ................................................................................................................................................................................................................................................*............................................................. + pkhtb r4, r4, r7, asr #16 // .................................................................................................................................................................................................................................................*............................................................ + smulwt r10, r3, r10 // ..................................................................................................................................................................................................................................................*........................................................... + usub16 r3, r8, r4 // ...................................................................................................................................................................................................................................................*.......................................................... + smlabt r11, r11, r12, r0 // ....................................................................................................................................................................................................................................................*......................................................... + vmov r7, s13 // .....................................................................................................................................................................................................................................................*........................................................ 
+ smlabt r10, r10, r12, r0 // ......................................................................................................................................................................................................................................................*....................................................... + uadd16 r8, r8, r4 // .......................................................................................................................................................................................................................................................*...................................................... + smulwb r4, r7, r9 // ........................................................................................................................................................................................................................................................*..................................................... + pkhtb r11, r10, r11, asr #16 // .........................................................................................................................................................................................................................................................*.................................................... + smulwt r10, r7, r9 // ..........................................................................................................................................................................................................................................................*................................................... + vmov r7, s2 // ...........................................................................................................................................................................................................................................................*.................................................. 
+ smlabt r4, r4, r12, r0 // ............................................................................................................................................................................................................................................................*................................................. + usub16 r9, r14, r7 // .............................................................................................................................................................................................................................................................*................................................ + smlabt r0, r10, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................... + uadd16 r14, r14, r7 // ...............................................................................................................................................................................................................................................................*.............................................. + vmov r10, s5 // ................................................................................................................................................................................................................................................................*............................................. + pkhtb r7, r0, r4, asr #16 // .................................................................................................................................................................................................................................................................*............................................ 
+ vmov r0, s23 // ..................................................................................................................................................................................................................................................................*........................................... + usub16 r4, r2, r11 // ...................................................................................................................................................................................................................................................................*.......................................... + str.w r9, [r0, #160] // ....................................................................................................................................................................................................................................................................*......................................... + usub16 r9, r6, r7 // .......................................................................................................................................................................................................................................................................*...................................... + str.w r14, [r0, #128] // ........................................................................................................................................................................................................................................................................*..................................... + uadd16 r7, r6, r7 // ...........................................................................................................................................................................................................................................................................*.................................. 
+ vmov r6, s7 // ............................................................................................................................................................................................................................................................................*................................. + usub16 r14, r9, r10 // .........................................................................................................................................................................................................................................................................*.................................... + // gap // .............................................................................................................................................................................................................................................................................................................. + uadd16 r11, r2, r11 // .....................................................................................................................................................................................................................................................................*........................................ + str.w r14, [r0, #352] // ..........................................................................................................................................................................................................................................................................*................................... + uadd16 r14, r9, r10 // .............................................................................................................................................................................................................................................................................*................................ 
+ str.w r14, [r0, #320] // ..............................................................................................................................................................................................................................................................................*............................... + uadd16 r10, r3, r6 // ...............................................................................................................................................................................................................................................................................*.............................. + vmov r14, s24 // ................................................................................................................................................................................................................................................................................*............................. + usub16 r3, r3, r6 // ..................................................................................................................................................................................................................................................................................*........................... + str.w r3, [r0, #480] // ...........................................................................................................................................................................................................................................................................................*.................. + str.w r10, [r0, #448] // .................................................................................................................................................................................................................................................................................*............................ 
+ vmov r3, s3 // ...................................................................................................................................................................................................................................................................................*.......................... + uadd16 r9, r5, r3 // ....................................................................................................................................................................................................................................................................................*......................... + str.w r9, [r0, #192] // .....................................................................................................................................................................................................................................................................................*........................ + usub16 r5, r5, r3 // ......................................................................................................................................................................................................................................................................................*....................... + vmov r3, s4 // ......................................................................................................................................................................................................................................................................*....................................... + uadd16 r9, r7, r3 // ..............................................................................................................................................................................................................................................................................................*............... 
+ vmov r10, s0 // ...................................................................................................................................................................................................................................................................................................*.......... + usub16 r7, r7, r3 // ........................................................................................................................................................................................................................................................................................*..................... + str.w r7, [r0, #288] // .........................................................................................................................................................................................................................................................................................*.................... + uadd16 r7, r11, r10 // ....................................................................................................................................................................................................................................................................................................*......... + vmov r6, s1 // ...............................................................................................................................................................................................................................................................................................*.............. + uadd16 r2, r4, r6 // ................................................................................................................................................................................................................................................................................................*............. 
+ str.w r2, [r0, #64] // .................................................................................................................................................................................................................................................................................................*............ + usub16 r4, r4, r6 // ..................................................................................................................................................................................................................................................................................................*........... + str.w r9, [r0, #256] // .........................................................................................................................................................................................................................................................................................................*.... + str.w r7, [r0], #4 // .......................................................................................................................................................................................................................................................................................................*...... // @slothy:core // @slothy:before=cmp + cmp.w r0, r14 // ........................................................................................................................................................................................................................................................................................................*..... 
// @slothy:id=cmp + usub16 r9, r11, r10 // ......................................................................................................................................................................................................................................................................................................*....... + str.w r4, [r0, #92] // .....................................................................................................................................................................................................................................................................................................*........ + str.w r9, [r0, #28] // ............................................................................................................................................................................................................................................................................................................*. + vmov r11, s6 // ..........................................................................................................................................................................................................................................................................................*................... + uadd16 r9, r8, r11 // ..........................................................................................................................................................................................................................................................................................................*... + str.w r9, [r0, #380] // ...........................................................................................................................................................................................................................................................................................................*.. 
+ usub16 r9, r8, r11 // ............................................................................................................................................................................................................................................................................................*................. + str.w r5, [r0, #220] // .......................................................................................................................................................................................................................................................................................*...................... + str.w r9, [r0, #412] // .............................................................................................................................................................................................................................................................................................*................ + bne.w layer1234_loop // .............................................................................................................................................................................................................................................................................................................* // @slothy:branch + + // ----------------------------------------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 + // 
|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr.w r3, [r0, #224] // .............*................................................................................................................................................................................................................................................................................................ + // ldr.w r10, [r0, #96] // ....*......................................................................................................................................................................................................................................................................................................... + // vmov s23, r0 // ..*........................................................................................................................................................................................................................................................................................................... + // vmov r5, s8 // *............................................................................................................................................................................................................................................................................................................. + // ldr.w r6, [r0, #352] // ...*.......................................................................................................................................................................................................................................................................................................... 
+ // ldr.w r8, [r0, #480] // .........*.................................................................................................................................................................................................................................................................................................... + // ldr.w r11, [r0, #416] // .*............................................................................................................................................................................................................................................................................................................ + // smulwt r4, r5, r6 // ........*..................................................................................................................................................................................................................................................................................................... + // ldr.w r2, [r0, #288] // .....*........................................................................................................................................................................................................................................................................................................ + // smulwb r7, r5, r8 // ....................*......................................................................................................................................................................................................................................................................................... + // ldr.w r14, [r0, #32] // .......*...................................................................................................................................................................................................................................................................................................... 
+ // smulwt r9, r5, r8 // .....................*........................................................................................................................................................................................................................................................................................ + // ldr.w r8, [r0, #160] // ...........*.................................................................................................................................................................................................................................................................................................. + // smulwb r6, r5, r6 // ......*....................................................................................................................................................................................................................................................................................................... + // movw r0, #24608 // ..............*............................................................................................................................................................................................................................................................................................... + // smlabt r7, r7, r12, r0 // .......................*...................................................................................................................................................................................................................................................................................... + // smlabt r9, r9, r12, r0 // .........................*.................................................................................................................................................................................................................................................................................... 
+ // smlabt r6, r6, r12, r0 // ...................*.......................................................................................................................................................................................................................................................................................... + // pkhtb r9, r9, r7, asr #16 // ............................*................................................................................................................................................................................................................................................................................. + // smlabt r7, r4, r12, r0 // .................*............................................................................................................................................................................................................................................................................................ + // smulwb r4, r5, r2 // ..........*................................................................................................................................................................................................................................................................................................... + // pkhtb r6, r7, r6, asr #16 // ......................*....................................................................................................................................................................................................................................................................................... + // smulwt r2, r5, r2 // ............*................................................................................................................................................................................................................................................................................................. 
+ // usub16 r7, r10, r6 // ....................................*......................................................................................................................................................................................................................................................................... + // smlabt r4, r4, r12, r0 // ...............*.............................................................................................................................................................................................................................................................................................. + // uadd16 r10, r10, r6 // ..................................*........................................................................................................................................................................................................................................................................... + // smlabt r6, r2, r12, r0 // ................*............................................................................................................................................................................................................................................................................................. + // smulwb r2, r5, r11 // ...........................*.................................................................................................................................................................................................................................................................................. + // pkhtb r4, r6, r4, asr #16 // ..................*........................................................................................................................................................................................................................................................................................... 
+ // smulwt r5, r5, r11 // .............................*................................................................................................................................................................................................................................................................................ + // usub16 r6, r14, r4 // ........................*..................................................................................................................................................................................................................................................................................... + // smlabt r2, r2, r12, r0 // .................................*............................................................................................................................................................................................................................................................................ + // uadd16 r4, r14, r4 // ..........................*................................................................................................................................................................................................................................................................................... + // smlabt r11, r5, r12, r0 // ...................................*.......................................................................................................................................................................................................................................................................... + // uadd16 r14, r3, r9 // ................................*............................................................................................................................................................................................................................................................................. 
+ // vmov r5, s9 // ...............................*.............................................................................................................................................................................................................................................................................. + // pkhtb r11, r11, r2, asr #16 // ......................................*....................................................................................................................................................................................................................................................................... + // smulwb r2, r5, r14 // .....................................*........................................................................................................................................................................................................................................................................ + // usub16 r9, r3, r9 // ..............................*............................................................................................................................................................................................................................................................................... + // smulwt r3, r5, r14 // .......................................*...................................................................................................................................................................................................................................................................... + // usub16 r14, r8, r11 // ...........................................*.................................................................................................................................................................................................................................................................. 
+ // smlabt r2, r2, r12, r0 // ........................................*..................................................................................................................................................................................................................................................................... + // uadd16 r8, r8, r11 // .........................................*.................................................................................................................................................................................................................................................................... + // smlabt r3, r3, r12, r0 // ..........................................*................................................................................................................................................................................................................................................................... + // smulwt r11, r5, r8 // ..............................................*............................................................................................................................................................................................................................................................... + // pkhtb r2, r3, r2, asr #16 // .............................................*................................................................................................................................................................................................................................................................ + // smulwb r8, r5, r8 // ............................................*................................................................................................................................................................................................................................................................. 
+ // uadd16 r3, r10, r2 // ...............................................*.............................................................................................................................................................................................................................................................. + // smlabt r11, r11, r12, r0 // ..................................................*........................................................................................................................................................................................................................................................... + // vmov r5, s10 // ........................................................................*..................................................................................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // ................................................*............................................................................................................................................................................................................................................................. + // usub16 r10, r10, r2 // ...................................................*.......................................................................................................................................................................................................................................................... + // smulwb r2, r5, r14 // ..................................................................................*........................................................................................................................................................................................................................... 
+ // smulwt r14, r5, r14 // ....................................................................................*......................................................................................................................................................................................................................... + // pkhtb r11, r11, r8, asr #16 // .....................................................*........................................................................................................................................................................................................................................................ + // smlabt r2, r2, r12, r0 // ......................................................................................*....................................................................................................................................................................................................................... + // uadd16 r8, r4, r11 // .......................................................*...................................................................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // ........................................................................................*..................................................................................................................................................................................................................... + // usub16 r4, r4, r11 // ...........................................................*.................................................................................................................................................................................................................................................. 
+ // smulwb r11, r5, r9 // ..........................................................................*................................................................................................................................................................................................................................... + // pkhtb r2, r14, r2, asr #16 // .............................................................................................*................................................................................................................................................................................................................ + // smulwt r5, r5, r9 // ............................................................................*................................................................................................................................................................................................................................. + // usub16 r9, r6, r2 // ...............................................................................................*.............................................................................................................................................................................................................. + // smlabt r11, r11, r12, r0 // ..............................................................................*............................................................................................................................................................................................................................... + // vmov r14, s11 // .................................................*............................................................................................................................................................................................................................................................ 
+ // smlabt r5, r5, r12, r0 // ................................................................................*............................................................................................................................................................................................................................. + // uadd16 r6, r6, r2 // .................................................................................................*............................................................................................................................................................................................................ + // smulwb r2, r14, r3 // ....................................................*......................................................................................................................................................................................................................................................... + // pkhtb r11, r5, r11, asr #16 // ...................................................................................*.......................................................................................................................................................................................................................... + // smulwt r5, r14, r3 // ......................................................*....................................................................................................................................................................................................................................................... + // vmov r14, s12 // .........................................................*.................................................................................................................................................................................................................................................... 
+ // smlabt r3, r2, r12, r0 // ........................................................*..................................................................................................................................................................................................................................................... + // uadd16 r2, r7, r11 // .........................................................................................*.................................................................................................................................................................................................................... + // smlabt r5, r5, r12, r0 // ..........................................................*................................................................................................................................................................................................................................................... + // usub16 r7, r7, r11 // .....................................................................................*........................................................................................................................................................................................................................ + // smulwb r11, r14, r10 // ..............................................................*............................................................................................................................................................................................................................................... + // pkhtb r3, r5, r3, asr #16 // .............................................................*................................................................................................................................................................................................................................................ 
+ // smulwt r10, r14, r10 // ............................................................*................................................................................................................................................................................................................................................. + // vmov r5, s14 // .......................................................................................*...................................................................................................................................................................................................................... + // smlabt r14, r11, r12, r0 // ..................................................................*........................................................................................................................................................................................................................................... + // uadd16 r11, r8, r3 // ...................................................................*.......................................................................................................................................................................................................................................... + // smlabt r10, r10, r12, r0 // ................................................................*............................................................................................................................................................................................................................................. + // usub16 r3, r8, r3 // ...............................................................*.............................................................................................................................................................................................................................................. 
+ // smulwb r8, r5, r7 // ..........................................................................................*................................................................................................................................................................................................................... + // pkhtb r10, r10, r14, asr #16 // ......................................................................*....................................................................................................................................................................................................................................... + // smulwt r14, r5, r7 // ............................................................................................*................................................................................................................................................................................................................. + // vmov r7, s13 // ...................................................................................................................*.......................................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // ..............................................................................................*............................................................................................................................................................................................................... + // usub16 r5, r4, r10 // ...............................................................................*.............................................................................................................................................................................................................................. 
+ // smlabt r14, r14, r12, r0 // ................................................................................................*............................................................................................................................................................................................................. + // uadd16 r4, r4, r10 // .................................................................................*............................................................................................................................................................................................................................ + // pkhtb r14, r14, r8, asr #16 // ...................................................................................................*.......................................................................................................................................................................................................... + // smulwb r10, r7, r2 // ......................................................................................................................*....................................................................................................................................................................................... + // vmov r8, s16 // .................................................................*............................................................................................................................................................................................................................................ + // smulwt r2, r7, r2 // ........................................................................................................................*..................................................................................................................................................................................... 
+ // smulwt r7, r8, r3 // .....................................................................*........................................................................................................................................................................................................................................ + // smulwb r3, r8, r3 // ....................................................................*......................................................................................................................................................................................................................................... + // uadd16 r8, r9, r14 // .....................................................................................................*........................................................................................................................................................................................................ + // smlabt r7, r7, r12, r0 // .........................................................................*.................................................................................................................................................................................................................................... + // usub16 r14, r9, r14 // .................................................................................................................*............................................................................................................................................................................................ + // smlabt r9, r3, r12, r0 // .......................................................................*...................................................................................................................................................................................................................................... 
+ // smlabt r3, r2, r12, r0 // ...........................................................................................................................*.................................................................................................................................................................................. + // vmov r2, s22 // ............................................................................................................................*................................................................................................................................................................................. + // smlabt r10, r10, r12, r0 // ..........................................................................................................................*................................................................................................................................................................................... + // pkhtb r9, r7, r9, asr #16 // ...........................................................................*.................................................................................................................................................................................................................................. + // smulwb r7, r2, r14 // ...............................................................................................................................*.............................................................................................................................................................................. + // vmov s1, r9 // .............................................................................*................................................................................................................................................................................................................................ 
+ // smulwt r14, r2, r14 // .................................................................................................................................*............................................................................................................................................................................ + // vmov r2, s18 // ...........................................................................................*.................................................................................................................................................................................................................. + // smlabt r7, r7, r12, r0 // ...................................................................................................................................*.......................................................................................................................................................................... + // pkhtb r9, r3, r10, asr #16 // ..............................................................................................................................*............................................................................................................................................................................... + // smulwb r10, r2, r5 // ..................................................................................................*........................................................................................................................................................................................................... + // usub16 r3, r6, r9 // ..................................................................................................................................*........................................................................................................................................................................... 
+ // smulwt r2, r2, r5 // ....................................................................................................*......................................................................................................................................................................................................... + // uadd16 r5, r6, r9 // ......................................................................................................................................*....................................................................................................................................................................... + // smlabt r9, r14, r12, r0 // .....................................................................................................................................*........................................................................................................................................................................ + // vmov r14, s21 // .......................................................................................................*...................................................................................................................................................................................................... + // smlabt r10, r10, r12, r0 // ......................................................................................................*....................................................................................................................................................................................................... + // smulwb r6, r14, r8 // .........................................................................................................*.................................................................................................................................................................................................... 
+ // smulwt r8, r14, r8 // ...........................................................................................................*.................................................................................................................................................................................................. + // pkhtb r9, r9, r7, asr #16 // ..........................................................................................................................................*................................................................................................................................................................... + // smlabt r7, r6, r12, r0 // .............................................................................................................*................................................................................................................................................................................................ + // vmov r6, s15 // ....................................................................................................................................*......................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // ..................................................................................................................*........................................................................................................................................................................................... + // smulwb r14, r6, r11 // .........................................................................................................................................*.................................................................................................................................................................... 
+ // smulwt r11, r6, r11 // .......................................................................................................................................*...................................................................................................................................................................... + // pkhtb r8, r8, r7, asr #16 // .....................................................................................................................*........................................................................................................................................................................................ + // smlabt r7, r14, r12, r0 // .....................................................................................................................................................*........................................................................................................................................................ + // vmov r14, s19 // ........................................................................................................................................*..................................................................................................................................................................... + // smlabt r11, r11, r12, r0 // .......................................................................................................................................................*...................................................................................................................................................... + // smulwb r6, r14, r5 // ...........................................................................................................................................*.................................................................................................................................................................. 
+ // pkhtb r11, r11, r7, asr #16 // ..........................................................................................................................................................*................................................................................................................................................... + // smulwt r14, r14, r5 // .............................................................................................................................................*................................................................................................................................................................ + // vmov s0, r11 // ..............................................................................................................................................................*............................................................................................................................................... + // smlabt r11, r2, r12, r0 // ........................................................................................................*..................................................................................................................................................................................................... + // vmov r2, s17 // ............................................................................................................*................................................................................................................................................................................................. + // pkhtb r5, r11, r10, asr #16 // ..........................................................................................................*................................................................................................................................................................................................... 
+ // vmov r10, s23 // .........................................................................................................................*.................................................................................................................................................................................... + // vmov s3, r5 // ..............................................................................................................*............................................................................................................................................................................................... + // smulwb r5, r2, r4 // ...............................................................................................................*.............................................................................................................................................................................................. + // vmov s7, r9 // ................................................................................................................................................*............................................................................................................................................................. + // smlabt r6, r6, r12, r0 // ...............................................................................................................................................*.............................................................................................................................................................. + // ldr.w r9, [r10, #64] // ..................................................................................................................................................................................................*........................................................................................................... 
+ // smulwt r2, r2, r4 // ................................................................................................................*............................................................................................................................................................................................. + // ldr.w r4, [r10, #320] // ................................................................................................................................................................*............................................................................................................................................. + // smlabt r7, r5, r12, r0 // ....................................................................................................................*......................................................................................................................................................................................... + // vmov s6, r8 // .......................................................................................................................*...................................................................................................................................................................................... + // smlabt r2, r2, r12, r0 // .............................................................................................................................*................................................................................................................................................................................ + // vmov r11, s8 // ..................................................................................................................................................*........................................................................................................................................................... 
+ // smlabt r5, r14, r12, r0 // .................................................................................................................................................*............................................................................................................................................................ + // pkhtb r8, r2, r7, asr #16 // ................................................................................................................................*............................................................................................................................................................................. + // smulwb r7, r11, r4 // .....................................................................................................................................................................*........................................................................................................................................ + // vmov r2, s20 // ............................................................................................................................................*................................................................................................................................................................. + // smulwt r4, r11, r4 // ...........................................................................................................................................................................*.................................................................................................................................. + // pkhtb r14, r5, r6, asr #16 // ....................................................................................................................................................*......................................................................................................................................................... 
+ // smulwb r5, r2, r3 // .........................................................................................................................................................*.................................................................................................................................................... + // vmov s2, r8 // ..............................................................................................................................................*............................................................................................................................................................... + // smulwt r3, r2, r3 // ...................................................................................................................................................*.......................................................................................................................................................... + // ldr.w r2, [r10, #256] // ......................................................................................................................................................*....................................................................................................................................................... + // smlabt r8, r5, r12, r0 // .............................................................................................................................................................*................................................................................................................................................ + // ldr.w r5, [r10, #384] // ..........................................................................................................................................................................*................................................................................................................................... 
+ // smlabt r6, r3, r12, r0 // ...............................................................................................................................................................*.............................................................................................................................................. + // movw r0, #24608 // ......................................................................................................................................................................*....................................................................................................................................... + // smlabt r3, r4, r12, r0 // ...............................................................................................................................................................................*.............................................................................................................................. + // ldr.w r4, [r10, #0] // ........................................................................................................................................................................*..................................................................................................................................... + // smlabt r7, r7, r12, r0 // .............................................................................................................................................................................*................................................................................................................................ + // pkhtb r6, r6, r8, asr #16 // ..................................................................................................................................................................*........................................................................................................................................... 
+ // smulwb r8, r11, r5 // .................................................................................................................................................................................*............................................................................................................................ + // vmov s5, r6 // ....................................................................................................................................................................*......................................................................................................................................... + // smulwb r6, r11, r2 // ...................................................................................................................................................................*.......................................................................................................................................... + // pkhtb r7, r3, r7, asr #16 // ....................................................................................................................................................................................*......................................................................................................................... + // smulwt r2, r11, r2 // ...........................................................................................................................................................*.................................................................................................................................................. + // uadd16 r3, r9, r7 // ..........................................................................................................................................................................................................*................................................................................................... 
+ // smlabt r6, r6, r12, r0 // .......................................................................................................................................................................*...................................................................................................................................... + // usub16 r7, r9, r7 // ..............................................................................................................................................................................................................*............................................................................................... + // smlabt r2, r2, r12, r0 // .........................................................................................................................................................................*.................................................................................................................................... + // ldr.w r9, [r10, #448] // ............................................................................................................................................................*................................................................................................................................................. + // smulwt r5, r11, r5 // ...................................................................................................................................................................................*.......................................................................................................................... + // pkhtb r6, r2, r6, asr #16 // ............................................................................................................................................................................*................................................................................................................................. 
+ // smlabt r8, r8, r12, r0 // .....................................................................................................................................................................................*........................................................................................................................ + // uadd16 r2, r4, r6 // ..............................................................................................................................................................................*............................................................................................................................... + // usub16 r6, r4, r6 // ................................................................................................................................................................................*............................................................................................................................. + // smulwt r4, r11, r9 // .................................................................................................................................................................*............................................................................................................................................ + // vmov s4, r14 // ........................................................................................................................................................*..................................................................................................................................................... + // smulwb r11, r11, r9 // ..................................................................................................................................................................................*........................................................................................................................... 
+ // ldr.w r14, [r10, #128] // ......................................................................................................................................................................................*....................................................................................................................... + // smlabt r9, r5, r12, r0 // .......................................................................................................................................................................................*...................................................................................................................... + // ldr.w r5, [r10, #192] // ........................................................................................................................................................................................*..................................................................................................................... + // smlabt r10, r11, r12, r0 // .........................................................................................................................................................................................*.................................................................................................................... + // pkhtb r8, r9, r8, asr #16 // ..........................................................................................................................................................................................*................................................................................................................... 
+ // smlabt r9, r4, r12, r0 // ...........................................................................................................................................................................................*.................................................................................................................. + // uadd16 r4, r14, r8 // ................................................................................................................................................................................................*............................................................................................................. + // vmov r11, s9 // ............................................................................................................................................................................................................*................................................................................................. + // pkhtb r9, r9, r10, asr #16 // ..............................................................................................................................................................................................*............................................................................................................... + // smulwb r10, r11, r4 // ......................................................................................................................................................................................................................*....................................................................................... + // usub16 r14, r14, r8 // ............................................................................................................................................................................................*................................................................................................................. 
+ // smulwt r4, r11, r4 // ........................................................................................................................................................................................................................*..................................................................................... + // usub16 r8, r5, r9 // ....................................................................................................................................................................................................*......................................................................................................... + // smlabt r10, r10, r12, r0 // ..........................................................................................................................................................................................................................*................................................................................... + // uadd16 r5, r5, r9 // ......................................................................................................................................................................................................*....................................................................................................... + // smlabt r4, r4, r12, r0 // ............................................................................................................................................................................................................................*................................................................................. + // smulwb r9, r11, r5 // ...............................................................................................................................................................................................................*.............................................................................................. 
+ // pkhtb r10, r4, r10, asr #16 // ...............................................................................................................................................................................................................................*.............................................................................. + // smulwt r5, r11, r5 // .................................................................................................................................................................................................................*............................................................................................ + // usub16 r11, r2, r10 // .................................................................................................................................................................................................................................*............................................................................ + // smlabt r9, r9, r12, r0 // ...................................................................................................................................................................................................................*.......................................................................................... + // vmov r4, s10 // .............................................................................................................................................................................................*................................................................................................................ + // smlabt r5, r5, r12, r0 // .....................................................................................................................................................................................................................*........................................................................................ 
+ // uadd16 r2, r2, r10 // ...................................................................................................................................................................................................................................*.......................................................................... + // smulwb r10, r4, r14 // ...............................................................................................................................................................................................*.............................................................................................................. + // pkhtb r5, r5, r9, asr #16 // .......................................................................................................................................................................................................................*...................................................................................... + // smulwt r14, r4, r14 // .................................................................................................................................................................................................*............................................................................................................ + // usub16 r9, r3, r5 // .........................................................................................................................................................................................................................*.................................................................................... + // smlabt r10, r10, r12, r0 // ...................................................................................................................................................................................................*.......................................................................................................... 
+ // uadd16 r3, r3, r5 // .............................................................................................................................................................................................................................*................................................................................ + // smlabt r5, r14, r12, r0 // .....................................................................................................................................................................................................*........................................................................................................ + // smulwb r14, r4, r8 // .......................................................................................................................................................................................................*...................................................................................................... + // pkhtb r5, r5, r10, asr #16 // ........................................................................................................................................................................................................*..................................................................................................... + // smulwt r4, r4, r8 // .........................................................................................................................................................................................................*.................................................................................................... + // usub16 r8, r6, r5 // ..................................................................................................................................................................................................................*........................................................................................... 
+ // smlabt r14, r14, r12, r0 // ...........................................................................................................................................................................................................*.................................................................................................. + // vmov r10, s12 // ...........................................................................................................................................................................................................................*.................................................................................. + // smlabt r4, r4, r12, r0 // .............................................................................................................................................................................................................*................................................................................................ + // uadd16 r6, r6, r5 // ....................................................................................................................................................................................................................*......................................................................................... + // smulwb r5, r10, r9 // ................................................................................................................................................................................................................................*............................................................................. + // pkhtb r14, r4, r14, asr #16 // ................................................................................................................................................................................................................*............................................................................................. 
+ // usub16 r4, r7, r14 // .....................................................................................................................................................................................................................................*........................................................................ + // smulwt r10, r10, r9 // ..............................................................................................................................................................................................................................*............................................................................... + // uadd16 r9, r7, r14 // .......................................................................................................................................................................................................................................*...................................................................... + // smlabt r5, r5, r12, r0 // ....................................................................................................................................................................................................................................*......................................................................... + // vmov r7, s14 // ......................................................................................................................................................................................................................................*....................................................................... + // smlabt r14, r10, r12, r0 // ..................................................................................................................................................................................................................................*........................................................................... 
+ // smulwb r10, r7, r4 // ........................................................................................................................................................................................................................................*..................................................................... + // pkhtb r14, r14, r5, asr #16 // .........................................................................................................................................................................................................................................*.................................................................... + // smulwt r4, r7, r4 // ..........................................................................................................................................................................................................................................*................................................................... + // usub16 r5, r11, r14 // ...............................................................................................................................................................................................................................................*.............................................................. + // smlabt r7, r10, r12, r0 // ............................................................................................................................................................................................................................................*................................................................. + // vmov r10, s11 // .............................................................................................................................................................................................................................................*................................................................ 
+ // smlabt r4, r4, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................... + // uadd16 r14, r11, r14 // ...........................................................................................................................................................................................................................................*.................................................................. + // smulwb r11, r10, r3 // ................................................................................................................................................................................................................................................*............................................................. + // pkhtb r7, r4, r7, asr #16 // .................................................................................................................................................................................................................................................*............................................................ + // smulwt r3, r10, r3 // ..................................................................................................................................................................................................................................................*........................................................... + // usub16 r10, r8, r7 // ...................................................................................................................................................................................................................................................*.......................................................... 
+ // smlabt r4, r11, r12, r0 // ....................................................................................................................................................................................................................................................*......................................................... + // vmov r11, s13 // .....................................................................................................................................................................................................................................................*........................................................ + // smlabt r3, r3, r12, r0 // ......................................................................................................................................................................................................................................................*....................................................... + // uadd16 r8, r8, r7 // .......................................................................................................................................................................................................................................................*...................................................... + // smulwb r7, r11, r9 // ........................................................................................................................................................................................................................................................*..................................................... + // pkhtb r4, r3, r4, asr #16 // .........................................................................................................................................................................................................................................................*.................................................... 
+ // smulwt r3, r11, r9 // ..........................................................................................................................................................................................................................................................*................................................... + // vmov r9, s2 // ...........................................................................................................................................................................................................................................................*.................................................. + // smlabt r11, r7, r12, r0 // ............................................................................................................................................................................................................................................................*................................................. + // usub16 r7, r14, r9 // .............................................................................................................................................................................................................................................................*................................................ + // smlabt r0, r3, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................... + // uadd16 r3, r14, r9 // ...............................................................................................................................................................................................................................................................*.............................................. 
+ // vmov r14, s5 // ................................................................................................................................................................................................................................................................*............................................. + // pkhtb r11, r0, r11, asr #16 // .................................................................................................................................................................................................................................................................*............................................ + // vmov r0, s23 // ..................................................................................................................................................................................................................................................................*........................................... + // usub16 r9, r2, r4 // ...................................................................................................................................................................................................................................................................*.......................................... + // str.w r7, [r0, #160] // ....................................................................................................................................................................................................................................................................*......................................... + // uadd16 r2, r2, r4 // ..........................................................................................................................................................................................................................................................................*................................... 
+ // vmov r7, s4 // .......................................................................................................................................................................................................................................................................................*...................... + // usub16 r4, r6, r11 // .....................................................................................................................................................................................................................................................................*........................................ + // str.w r3, [r0, #128] // ......................................................................................................................................................................................................................................................................*....................................... + // usub16 r3, r4, r14 // .........................................................................................................................................................................................................................................................................*.................................... + // str.w r3, [r0, #352] // ...........................................................................................................................................................................................................................................................................*.................................. + // uadd16 r3, r6, r11 // .......................................................................................................................................................................................................................................................................*...................................... 
+ // vmov r11, s7 // ........................................................................................................................................................................................................................................................................*..................................... + // uadd16 r6, r4, r14 // ............................................................................................................................................................................................................................................................................*................................. + // str.w r6, [r0, #320] // .............................................................................................................................................................................................................................................................................*................................ + // uadd16 r4, r10, r11 // ..............................................................................................................................................................................................................................................................................*............................... + // vmov r14, s24 // ...............................................................................................................................................................................................................................................................................*.............................. + // str.w r4, [r0, #448] // ..................................................................................................................................................................................................................................................................................*........................... 
+ // usub16 r4, r10, r11 // ................................................................................................................................................................................................................................................................................*............................. + // vmov r6, s3 // ...................................................................................................................................................................................................................................................................................*.......................... + // uadd16 r10, r5, r6 // ....................................................................................................................................................................................................................................................................................*......................... + // str.w r10, [r0, #192] // .....................................................................................................................................................................................................................................................................................*........................ + // usub16 r11, r5, r6 // ......................................................................................................................................................................................................................................................................................*....................... + // str.w r11, [r0, #224] // ...........................................................................................................................................................................................................................................................................................................*.. 
+ // usub16 r6, r3, r7 // ..........................................................................................................................................................................................................................................................................................*................... + // str.w r6, [r0, #288] // ...........................................................................................................................................................................................................................................................................................*.................. + // vmov r10, s6 // .......................................................................................................................................................................................................................................................................................................*...... + // str.w r4, [r0, #480] // .................................................................................................................................................................................................................................................................................*............................ + // usub16 r4, r8, r10 // ..........................................................................................................................................................................................................................................................................................................*... + // str.w r4, [r0, #416] // ............................................................................................................................................................................................................................................................................................................*. 
+ // uadd16 r5, r3, r7 // ........................................................................................................................................................................................................................................................................................*..................... + // vmov r3, s1 // .............................................................................................................................................................................................................................................................................................*................ + // uadd16 r7, r9, r3 // ..............................................................................................................................................................................................................................................................................................*............... + // str.w r7, [r0, #64] // ...............................................................................................................................................................................................................................................................................................*.............. + // usub16 r9, r9, r3 // ................................................................................................................................................................................................................................................................................................*............. + // vmov r3, s0 // .........................................................................................................................................................................................................................................................................................*.................... 
+ // uadd16 r4, r2, r3 // ............................................................................................................................................................................................................................................................................................*................. + // str.w r9, [r0, #96] // .....................................................................................................................................................................................................................................................................................................*........ + // usub16 r7, r2, r3 // ....................................................................................................................................................................................................................................................................................................*......... + // str.w r4, [r0], #4 // ..................................................................................................................................................................................................................................................................................................*........... + // cmp.w r0, r14 // ...................................................................................................................................................................................................................................................................................................*.......... + // str.w r5, [r0, #252] // .................................................................................................................................................................................................................................................................................................*............ 
+ // uadd16 r2, r8, r10 // ........................................................................................................................................................................................................................................................................................................*..... + // str.w r2, [r0, #380] // .........................................................................................................................................................................................................................................................................................................*.... + // str.w r7, [r0, #28] // ......................................................................................................................................................................................................................................................................................................*....... + // bne.w layer1234_loop // .............................................................................................................................................................................................................................................................................................................* + + + + // ----------------------------------------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- 
+ // vmov s23, r0 // ..*........................................................................................................................................................................................................................................................................................................... + // ldr.w r2, [r0, #32] // ..........*................................................................................................................................................................................................................................................................................................... + // ldr.w r3, [r0, #256/4+32] // .*............................................................................................................................................................................................................................................................................................................ + // ldr.w r4, [r0, #2*256/4+32] // ............*................................................................................................................................................................................................................................................................................................. + // ldr.w r5, [r0, #3*256/4+32] // *............................................................................................................................................................................................................................................................................................................. 
+ // ldr.w r6, [r0, #256+32] // ........*..................................................................................................................................................................................................................................................................................................... + // ldr.w r7, [r0, #5*256/4+32] // ....*......................................................................................................................................................................................................................................................................................................... + // ldr.w r8, [r0, #6*256/4+32] // ......*....................................................................................................................................................................................................................................................................................................... + // ldr.w r9, [r0, #7*256/4+32] // .....*........................................................................................................................................................................................................................................................................................................ + // movw r0, #24608 // ..............*............................................................................................................................................................................................................................................................................................... 
+ // vmov r10, s8 // ...*.......................................................................................................................................................................................................................................................................................................... + // smulwb r14, r10, r6 // ....................*......................................................................................................................................................................................................................................................................................... + // smulwt r6, r10, r6 // ......................*....................................................................................................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // ........................*..................................................................................................................................................................................................................................................................................... + // smlabt r6, r6, r12, r0 // ..........................*................................................................................................................................................................................................................................................................................... + // pkhtb r14, r6, r14, asr #16 // ............................*................................................................................................................................................................................................................................................................................. 
+ // usub16 r6, r2, r14 // ..............................*............................................................................................................................................................................................................................................................................... + // uadd16 r2, r2, r14 // ................................*............................................................................................................................................................................................................................................................................. + // smulwb r14, r10, r7 // .............*................................................................................................................................................................................................................................................................................................ + // smulwt r7, r10, r7 // .......*...................................................................................................................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .................*............................................................................................................................................................................................................................................................................................ + // smlabt r7, r7, r12, r0 // ...................*.......................................................................................................................................................................................................................................................................................... 
+ // pkhtb r14, r7, r14, asr #16 // .....................*........................................................................................................................................................................................................................................................................................ + // usub16 r7, r3, r14 // .......................*...................................................................................................................................................................................................................................................................................... + // uadd16 r3, r3, r14 // .........................*.................................................................................................................................................................................................................................................................................... + // smulwb r14, r10, r8 // ...........................*.................................................................................................................................................................................................................................................................................. + // smulwt r8, r10, r8 // .............................*................................................................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // ...............................*.............................................................................................................................................................................................................................................................................. 
+ // smlabt r8, r8, r12, r0 // .................................*............................................................................................................................................................................................................................................................................ + // pkhtb r14, r8, r14, asr #16 // ....................................*......................................................................................................................................................................................................................................................................... + // usub16 r8, r4, r14 // ........................................*..................................................................................................................................................................................................................................................................... + // uadd16 r4, r4, r14 // ..........................................*................................................................................................................................................................................................................................................................... + // smulwb r14, r10, r9 // .........*.................................................................................................................................................................................................................................................................................................... + // smulwt r9, r10, r9 // ...........*.................................................................................................................................................................................................................................................................................................. 
+ // smlabt r14, r14, r12, r0 // ...............*.............................................................................................................................................................................................................................................................................................. + // smlabt r9, r9, r12, r0 // ................*............................................................................................................................................................................................................................................................................................. + // pkhtb r14, r9, r14, asr #16 // ..................*........................................................................................................................................................................................................................................................................................... + // usub16 r9, r5, r14 // ......................................*....................................................................................................................................................................................................................................................................... + // uadd16 r5, r5, r14 // ..................................*........................................................................................................................................................................................................................................................................... + // vmov r10, s9 // ...................................*.......................................................................................................................................................................................................................................................................... 
+ // vmov r11, s10 // .................................................*............................................................................................................................................................................................................................................................ + // smulwb r14, r10, r4 // ..............................................*............................................................................................................................................................................................................................................................... + // smulwt r4, r10, r4 // ............................................*................................................................................................................................................................................................................................................................. + // smlabt r14, r14, r12, r0 // ..................................................*........................................................................................................................................................................................................................................................... + // smlabt r4, r4, r12, r0 // ................................................*............................................................................................................................................................................................................................................................. + // pkhtb r14, r4, r14, asr #16 // ......................................................*....................................................................................................................................................................................................................................................... 
+ // usub16 r4, r2, r14 // ..........................................................*................................................................................................................................................................................................................................................... + // uadd16 r2, r2, r14 // ........................................................*..................................................................................................................................................................................................................................................... + // smulwb r14, r10, r5 // .....................................*........................................................................................................................................................................................................................................................................ + // smulwt r5, r10, r5 // .......................................*...................................................................................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .........................................*.................................................................................................................................................................................................................................................................... + // smlabt r5, r5, r12, r0 // ...........................................*.................................................................................................................................................................................................................................................................. 
+ // pkhtb r14, r5, r14, asr #16 // .............................................*................................................................................................................................................................................................................................................................ + // usub16 r5, r3, r14 // ...................................................*.......................................................................................................................................................................................................................................................... + // uadd16 r3, r3, r14 // ...............................................*.............................................................................................................................................................................................................................................................. + // smulwb r14, r11, r8 // ....................................................*......................................................................................................................................................................................................................................................... + // smulwt r8, r11, r8 // .....................................................*........................................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // .......................................................*...................................................................................................................................................................................................................................................... 
+ // smlabt r8, r8, r12, r0 // .........................................................*.................................................................................................................................................................................................................................................... + // pkhtb r14, r8, r14, asr #16 // ............................................................*................................................................................................................................................................................................................................................. + // usub16 r8, r6, r14 // ..............................................................*............................................................................................................................................................................................................................................... + // uadd16 r6, r6, r14 // ..................................................................*........................................................................................................................................................................................................................................... + // smulwb r14, r11, r9 // ...........................................................*.................................................................................................................................................................................................................................................. + // smulwt r9, r11, r9 // .............................................................*................................................................................................................................................................................................................................................ 
+ // smlabt r14, r14, r12, r0 // ...............................................................*.............................................................................................................................................................................................................................................. + // smlabt r9, r9, r12, r0 // .................................................................*............................................................................................................................................................................................................................................ + // pkhtb r14, r9, r14, asr #16 // ....................................................................*......................................................................................................................................................................................................................................... + // usub16 r9, r7, r14 // ..........................................................................*................................................................................................................................................................................................................................... + // uadd16 r7, r7, r14 // ........................................................................*..................................................................................................................................................................................................................................... + // vmov r10, s11 // ................................................................*............................................................................................................................................................................................................................................. 
+ // vmov r11, s12 // ......................................................................*....................................................................................................................................................................................................................................... + // smulwb r14, r10, r3 // ...................................................................*.......................................................................................................................................................................................................................................... + // smulwt r3, r10, r3 // .....................................................................*........................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // .......................................................................*...................................................................................................................................................................................................................................... + // smlabt r3, r3, r12, r0 // .........................................................................*.................................................................................................................................................................................................................................... + // pkhtb r14, r3, r14, asr #16 // ............................................................................*................................................................................................................................................................................................................................. 
+ // usub16 r3, r2, r14 // ..................................................................................*........................................................................................................................................................................................................................... + // uadd16 r2, r2, r14 // ................................................................................*............................................................................................................................................................................................................................. + // smulwb r14, r11, r5 // ...........................................................................*.................................................................................................................................................................................................................................. + // smulwt r5, r11, r5 // .............................................................................*................................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // ...............................................................................*.............................................................................................................................................................................................................................. + // smlabt r5, r5, r12, r0 // .................................................................................*............................................................................................................................................................................................................................ 
+ // pkhtb r14, r5, r14, asr #16 // ....................................................................................*......................................................................................................................................................................................................................... + // usub16 r5, r4, r14 // ........................................................................................*..................................................................................................................................................................................................................... + // uadd16 r4, r4, r14 // ..........................................................................................*................................................................................................................................................................................................................... + // vmov r10, s13 // ......................................................................................*....................................................................................................................................................................................................................... + // vmov r11, s14 // ..............................................................................*............................................................................................................................................................................................................................... + // smulwb r14, r10, r7 // ............................................................................................*................................................................................................................................................................................................................. 
+ // smulwt r7, r10, r7 // ..............................................................................................*............................................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .......................................................................................................*...................................................................................................................................................................................................... + // smlabt r7, r7, r12, r0 // .....................................................................................................*........................................................................................................................................................................................................ + // pkhtb r14, r7, r14, asr #16 // ..............................................................................................................*............................................................................................................................................................................................... + // usub16 r7, r6, r14 // ................................................................................................................*............................................................................................................................................................................................. + // uadd16 r6, r6, r14 // ..................................................................................................................*........................................................................................................................................................................................... 
+ // smulwb r14, r11, r9 // ...................................................................................*.......................................................................................................................................................................................................................... + // smulwt r9, r11, r9 // .....................................................................................*........................................................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // .......................................................................................*...................................................................................................................................................................................................................... + // smlabt r9, r9, r12, r0 // .........................................................................................*.................................................................................................................................................................................................................... + // pkhtb r14, r9, r14, asr #16 // ...........................................................................................*.................................................................................................................................................................................................................. + // usub16 r9, r8, r14 // ...................................................................................................*.......................................................................................................................................................................................................... 
+ // uadd16 r8, r8, r14 // .................................................................................................*............................................................................................................................................................................................................ + // vmov r10, s15 // ..........................................................................................................................*................................................................................................................................................................................... + // vmov r11, s16 // .............................................................................................*................................................................................................................................................................................................................ + // smulwb r14, r10, r2 // ............................................................................................................................*................................................................................................................................................................................. + // smulwt r2, r10, r2 // .............................................................................................................................*................................................................................................................................................................................ + // smlabt r14, r14, r12, r0 // ...............................................................................................................................*.............................................................................................................................................................................. 
+ // smlabt r2, r2, r12, r0 // .................................................................................................................................*............................................................................................................................................................................ + // pkhtb r2, r2, r14, asr #16 // ...................................................................................................................................*.......................................................................................................................................................................... + // smulwb r14, r11, r3 // ................................................................................................*............................................................................................................................................................................................................. + // smulwt r3, r11, r3 // ...............................................................................................*.............................................................................................................................................................................................................. + // smlabt r14, r14, r12, r0 // ....................................................................................................*......................................................................................................................................................................................................... 
+ // smlabt r3, r3, r12, r0 // ..................................................................................................*........................................................................................................................................................................................................... + // pkhtb r3, r3, r14, asr #16 // ........................................................................................................*..................................................................................................................................................................................................... + // vmov r10, s17 // .......................................................................................................................................*...................................................................................................................................................................... + // vmov r11, s18 // ............................................................................................................*................................................................................................................................................................................................. + // smulwb r14, r10, r4 // ...........................................................................................................................................*.................................................................................................................................................................. + // smulwt r4, r10, r4 // ...............................................................................................................................................*.............................................................................................................................................................. 
+ // smlabt r14, r14, r12, r0 // .................................................................................................................................................*............................................................................................................................................................ + // smlabt r4, r4, r12, r0 // ...................................................................................................................................................*.......................................................................................................................................................... + // pkhtb r4, r4, r14, asr #16 // ......................................................................................................................................................*....................................................................................................................................................... + // smulwb r14, r11, r5 // ...............................................................................................................*.............................................................................................................................................................................................. + // smulwt r5, r11, r5 // .................................................................................................................*............................................................................................................................................................................................ 
+ // smlabt r14, r14, r12, r0 // .....................................................................................................................*........................................................................................................................................................................................ + // smlabt r5, r5, r12, r0 // ......................................................................................................................................*....................................................................................................................................................................... + // pkhtb r5, r5, r14, asr #16 // ........................................................................................................................................*..................................................................................................................................................................... + // vmov r10, s19 // ................................................................................................................................*............................................................................................................................................................................. + // vmov r11, s20 // ........................................................................................................................................................*..................................................................................................................................................... + // smulwb r14, r10, r6 // ..................................................................................................................................*........................................................................................................................................................................... 
+ // smulwt r6, r10, r6 // ....................................................................................................................................*......................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .............................................................................................................................................*................................................................................................................................................................ + // smlabt r6, r6, r12, r0 // .....................................................................................................................................................*........................................................................................................................................................ + // pkhtb r6, r6, r14, asr #16 // ..........................................................................................................................................................*................................................................................................................................................... + // smulwb r14, r11, r7 // ...........................................................................................................................................................*.................................................................................................................................................. + // smulwt r7, r11, r7 // .............................................................................................................................................................*................................................................................................................................................ 
+ // smlabt r14, r14, r12, r0 // ...............................................................................................................................................................*.............................................................................................................................................. + // smlabt r7, r7, r12, r0 // .................................................................................................................................................................*............................................................................................................................................ + // pkhtb r7, r7, r14, asr #16 // ......................................................................................................................................................................*....................................................................................................................................... + // vmov r10, s21 // ....................................................................................................................*......................................................................................................................................................................................... + // vmov r11, s22 // ......................................................................................................*....................................................................................................................................................................................................... + // smulwb r14, r10, r8 // ......................................................................................................................*....................................................................................................................................................................................... 
+ // smulwt r8, r10, r8 // .......................................................................................................................*...................................................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .........................................................................................................................*.................................................................................................................................................................................... + // smlabt r8, r8, r12, r0 // ...........................................................................................................................*.................................................................................................................................................................................. + // pkhtb r8, r8, r14, asr #16 // ..............................................................................................................................*............................................................................................................................................................................... + // smulwb r14, r11, r9 // .........................................................................................................*.................................................................................................................................................................................................... + // smulwt r9, r11, r9 // ...........................................................................................................*.................................................................................................................................................................................................. 
+ // smlabt r14, r14, r12, r0 // .............................................................................................................*................................................................................................................................................................................................ + // smlabt r9, r9, r12, r0 // ...................................................................................................................*.......................................................................................................................................................................................... + // pkhtb r9, r9, r14, asr #16 // ........................................................................................................................*..................................................................................................................................................................................... + // vmov s0, r2 // .....................................................................................................................................*........................................................................................................................................................................ + // vmov s1, r3 // ..........................................................................................................*................................................................................................................................................................................................... + // vmov s2, r4 // ............................................................................................................................................................*................................................................................................................................................. 
+ // vmov s3, r5 // ..........................................................................................................................................*................................................................................................................................................................... + // vmov s4, r6 // .......................................................................................................................................................................................*...................................................................................................................... + // vmov s5, r7 // ........................................................................................................................................................................*..................................................................................................................................... + // vmov s6, r8 // ..................................................................................................................................................*........................................................................................................................................................... + // vmov s7, r9 // ............................................................................................................................................*................................................................................................................................................................. + // vmov r0, s23 // .........................................................................................................................................*.................................................................................................................................................................... 
+ // ldr.w r2, [r0, #0] // ....................................................................................................................................................................*......................................................................................................................................... + // ldr.w r3, [r0, #256/4] // ..............................................................................................................................................*............................................................................................................................................................... + // ldr.w r4, [r0, #2*256/4] // .........................................................................................................................................................................................*.................................................................................................................... + // ldr.w r5, [r0, #3*256/4] // ...........................................................................................................................................................................................*.................................................................................................................. + // ldr.w r6, [r0, #256] // ..............................................................................................................................................................*............................................................................................................................................... 
+ // ldr.w r7, [r0, #5*256/4] // ................................................................................................................................................*............................................................................................................................................................. + // ldr.w r8, [r0, #6*256/4] // ................................................................................................................................................................*............................................................................................................................................. + // ldr.w r9, [r0, #7*256/4] // ................................................................................................................................................................................*............................................................................................................................. + // movw r0, #24608 // ..................................................................................................................................................................*........................................................................................................................................... + // vmov r10, s8 // ....................................................................................................................................................*......................................................................................................................................................... + // smulwb r14, r10, r6 // .........................................................................................................................................................................*.................................................................................................................................... 
+ // smulwt r6, r10, r6 // ...........................................................................................................................................................................*.................................................................................................................................. + // smlabt r14, r14, r12, r0 // .............................................................................................................................................................................*................................................................................................................................ + // smlabt r6, r6, r12, r0 // ...............................................................................................................................................................................*.............................................................................................................................. + // pkhtb r14, r6, r14, asr #16 // ..................................................................................................................................................................................*........................................................................................................................... + // usub16 r6, r2, r14 // .....................................................................................................................................................................................*........................................................................................................................ + // uadd16 r2, r2, r14 // ....................................................................................................................................................................................*......................................................................................................................... 
+ // smulwb r14, r10, r7 // .......................................................................................................................................................*...................................................................................................................................................... + // smulwt r7, r10, r7 // .........................................................................................................................................................*.................................................................................................................................................... + // smlabt r14, r14, r12, r0 // .....................................................................................................................................................................*........................................................................................................................................ + // smlabt r7, r7, r12, r0 // ...................................................................................................................................................................*.......................................................................................................................................... + // pkhtb r14, r7, r14, asr #16 // ..........................................................................................................................................................................*................................................................................................................................... + // usub16 r7, r3, r14 // ..............................................................................................................................................................................*............................................................................................................................... 
+ // uadd16 r3, r3, r14 // ............................................................................................................................................................................*................................................................................................................................. + // smulwb r14, r10, r8 // .......................................................................................................................................................................*...................................................................................................................................... + // smulwt r8, r10, r8 // .................................................................................................................................................................................*............................................................................................................................ + // smlabt r14, r14, r12, r0 // ...................................................................................................................................................................................*.......................................................................................................................... + // smlabt r8, r8, r12, r0 // ..........................................................................................................................................................................................*................................................................................................................... + // pkhtb r14, r8, r14, asr #16 // .............................................................................................................................................................................................*................................................................................................................ 
+ // usub16 r8, r4, r14 // ...................................................................................................................................................................................................*.......................................................................................................... + // uadd16 r4, r4, r14 // ...............................................................................................................................................................................................*.............................................................................................................. + // smulwb r14, r10, r9 // ........................................................................................................................................................................................*..................................................................................................................... + // smulwt r9, r10, r9 // ......................................................................................................................................................................................*....................................................................................................................... + // smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................*................................................................................................................. + // smlabt r9, r9, r12, r0 // ..............................................................................................................................................................................................*............................................................................................................... 
+ // pkhtb r14, r9, r14, asr #16 // .................................................................................................................................................................................................*............................................................................................................ + // usub16 r9, r5, r14 // .....................................................................................................................................................................................................*........................................................................................................ + // uadd16 r5, r5, r14 // .......................................................................................................................................................................................................*...................................................................................................... + // vmov r10, s9 // ................................................................................................................................................................................................*............................................................................................................. + // vmov r11, s10 // ..............................................................................................................................................................................................................*............................................................................................... + // smulwb r14, r10, r4 // ..................................................................................................................................................................................................*........................................................................................................... 
+ // smulwt r4, r10, r4 // ....................................................................................................................................................................................................*......................................................................................................... + // smlabt r14, r14, r12, r0 // ......................................................................................................................................................................................................*....................................................................................................... + // smlabt r4, r4, r12, r0 // ........................................................................................................................................................................................................*..................................................................................................... + // pkhtb r14, r4, r14, asr #16 // ..........................................................................................................................................................................................................*................................................................................................... + // usub16 r4, r2, r14 // ............................................................................................................................................................................................................*................................................................................................. + // uadd16 r2, r2, r14 // ................................................................................................................................................................................................................*............................................................................................. 
+ // smulwb r14, r10, r5 // .........................................................................................................................................................................................................*.................................................................................................... + // smulwt r5, r10, r5 // ...........................................................................................................................................................................................................*.................................................................................................. + // smlabt r14, r14, r12, r0 // .............................................................................................................................................................................................................*................................................................................................ + // smlabt r5, r5, r12, r0 // ...............................................................................................................................................................................................................*.............................................................................................. + // pkhtb r14, r5, r14, asr #16 // ..................................................................................................................................................................................................................*........................................................................................... + // usub16 r5, r3, r14 // ....................................................................................................................................................................................................................*......................................................................................... 
+ // uadd16 r3, r3, r14 // ......................................................................................................................................................................................................................*....................................................................................... + // smulwb r14, r11, r8 // .................................................................................................................................................................................................................*............................................................................................ + // smulwt r8, r11, r8 // ...................................................................................................................................................................................................................*.......................................................................................... + // smlabt r14, r14, r12, r0 // .....................................................................................................................................................................................................................*........................................................................................ + // smlabt r8, r8, r12, r0 // .......................................................................................................................................................................................................................*...................................................................................... + // pkhtb r14, r8, r14, asr #16 // .........................................................................................................................................................................................................................*.................................................................................... 
+ // usub16 r8, r6, r14 // ...........................................................................................................................................................................................................................*.................................................................................. + // uadd16 r6, r6, r14 // ...............................................................................................................................................................................................................................*.............................................................................. + // smulwb r14, r11, r9 // ........................................................................................................................................................................................................................*..................................................................................... + // smulwt r9, r11, r9 // ..........................................................................................................................................................................................................................*................................................................................... + // smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................................*................................................................................. + // smlabt r9, r9, r12, r0 // ..............................................................................................................................................................................................................................*............................................................................... 
+ // pkhtb r14, r9, r14, asr #16 // .................................................................................................................................................................................................................................*............................................................................ + // usub16 r9, r7, r14 // ..................................................................................................................................................................................................................................*........................................................................... + // uadd16 r7, r7, r14 // ....................................................................................................................................................................................................................................*......................................................................... + // vmov r10, s11 // .............................................................................................................................................................................................................................................*................................................................ + // vmov r11, s12 // .............................................................................................................................................................................................................................*................................................................................ + // smulwb r14, r10, r3 // ................................................................................................................................................................................................................................................*............................................................. 
+ // smulwt r3, r10, r3 // ..................................................................................................................................................................................................................................................*........................................................... + // smlabt r14, r14, r12, r0 // ....................................................................................................................................................................................................................................................*......................................................... + // smlabt r3, r3, r12, r0 // ......................................................................................................................................................................................................................................................*....................................................... + // pkhtb r14, r3, r14, asr #16 // .........................................................................................................................................................................................................................................................*.................................................... + // usub16 r3, r2, r14 // ...................................................................................................................................................................................................................................................................*.......................................... + // uadd16 r2, r2, r14 // .....................................................................................................................................................................................................................................................................*........................................ 
+ // smulwb r14, r11, r5 // ................................................................................................................................................................................................................................*............................................................................. + // smulwt r5, r11, r5 // ...................................................................................................................................................................................................................................*.......................................................................... + // smlabt r14, r14, r12, r0 // .....................................................................................................................................................................................................................................*........................................................................ + // smlabt r5, r5, r12, r0 // .......................................................................................................................................................................................................................................*...................................................................... + // pkhtb r14, r5, r14, asr #16 // .........................................................................................................................................................................................................................................*.................................................................... + // usub16 r5, r4, r14 // ...........................................................................................................................................................................................................................................*.................................................................. 
+ // uadd16 r4, r4, r14 // ...............................................................................................................................................................................................................................................*.............................................................. + // vmov r10, s13 // .....................................................................................................................................................................................................................................................*........................................................ + // vmov r11, s14 // ......................................................................................................................................................................................................................................*....................................................................... + // smulwb r14, r10, r7 // ........................................................................................................................................................................................................................................................*..................................................... + // smulwt r7, r10, r7 // ..........................................................................................................................................................................................................................................................*................................................... + // smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................................................................*................................................. 
+ // smlabt r7, r7, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................... + // pkhtb r14, r7, r14, asr #16 // .................................................................................................................................................................................................................................................................*............................................ + // usub16 r7, r6, r14 // .......................................................................................................................................................................................................................................................................*...................................... + // uadd16 r6, r6, r14 // ...........................................................................................................................................................................................................................................................................*.................................. + // smulwb r14, r11, r9 // ........................................................................................................................................................................................................................................*..................................................................... + // smulwt r9, r11, r9 // ..........................................................................................................................................................................................................................................*................................................................... 
+ // smlabt r14, r14, r12, r0 // ............................................................................................................................................................................................................................................*................................................................. + // smlabt r9, r9, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................... + // pkhtb r14, r9, r14, asr #16 // .................................................................................................................................................................................................................................................*............................................................ + // usub16 r9, r8, r14 // ...................................................................................................................................................................................................................................................*.......................................................... + // uadd16 r8, r8, r14 // .......................................................................................................................................................................................................................................................*...................................................... + // vmov r0, s23 // ..................................................................................................................................................................................................................................................................*........................................... 
+ // vmov r10, s1 // ...............................................................................................................................................................................................................................................................................................*.............. + // uadd16 r14, r3, r10 // ................................................................................................................................................................................................................................................................................................*............. + // usub16 r3, r3, r10 // ..................................................................................................................................................................................................................................................................................................*........... + // str.w r14, [r0, #1*256/4] // .................................................................................................................................................................................................................................................................................................*............ + // str.w r3, [r0, #1*256/4+32] // .....................................................................................................................................................................................................................................................................................................*........ + // vmov r10, s3 // ...................................................................................................................................................................................................................................................................................*.......................... 
+ // uadd16 r14, r5, r10 // ....................................................................................................................................................................................................................................................................................*......................... + // usub16 r5, r5, r10 // ......................................................................................................................................................................................................................................................................................*....................... + // str.w r14, [r0, #3*256/4] // .....................................................................................................................................................................................................................................................................................*........................ + // str.w r5, [r0, #3*256/4+32] // .......................................................................................................................................................................................................................................................................................*...................... + // vmov r10, s5 // ................................................................................................................................................................................................................................................................*............................................. + // uadd16 r14, r7, r10 // .............................................................................................................................................................................................................................................................................*................................ 
+ // usub16 r7, r7, r10 // .........................................................................................................................................................................................................................................................................*.................................... + // str.w r14, [r0, #5*256/4] // ..............................................................................................................................................................................................................................................................................*............................... + // str.w r7, [r0, #5*256/4+32] // ..........................................................................................................................................................................................................................................................................*................................... + // vmov r10, s7 // ............................................................................................................................................................................................................................................................................*................................. + // uadd16 r14, r9, r10 // ...............................................................................................................................................................................................................................................................................*.............................. + // usub16 r9, r9, r10 // ..................................................................................................................................................................................................................................................................................*........................... 
+ // str.w r14, [r0, #7*256/4] // .................................................................................................................................................................................................................................................................................*............................ + // str.w r9, [r0, #7*256/4+32] // ...........................................................................................................................................................................................................................................................................................*.................. + // vmov r5, s2 // ...........................................................................................................................................................................................................................................................*.................................................. + // uadd16 r14, r4, r5 // ...............................................................................................................................................................................................................................................................*.............................................. + // usub16 r10, r4, r5 // .............................................................................................................................................................................................................................................................*................................................ + // str.w r14, [r0, #2*256/4] // ........................................................................................................................................................................................................................................................................*..................................... 
+ // str.w r10, [r0, #2*256/4+32] // ....................................................................................................................................................................................................................................................................*......................................... + // vmov r7, s4 // ......................................................................................................................................................................................................................................................................*....................................... + // uadd16 r14, r6, r7 // ..............................................................................................................................................................................................................................................................................................*............... + // usub16 r10, r6, r7 // ........................................................................................................................................................................................................................................................................................*..................... + // str.w r14, [r0, #4*256/4] // .........................................................................................................................................................................................................................................................................................................*.... + // str.w r10, [r0, #4*256/4+32] // .........................................................................................................................................................................................................................................................................................*.................... 
+ // vmov r9, s6 // ..........................................................................................................................................................................................................................................................................................*................... + // uadd16 r14, r8, r9 // ..........................................................................................................................................................................................................................................................................................................*... + // usub16 r10, r8, r9 // ............................................................................................................................................................................................................................................................................................*................. + // str.w r14, [r0, #6*256/4] // ...........................................................................................................................................................................................................................................................................................................*.. + // str.w r10, [r0, #6*256/4+32] // .............................................................................................................................................................................................................................................................................................*................ + // vmov r3, s0 // ...................................................................................................................................................................................................................................................................................................*.......... 
+ // uadd16 r14, r2, r3 // ....................................................................................................................................................................................................................................................................................................*......... + // usub16 r10, r2, r3 // ......................................................................................................................................................................................................................................................................................................*....... + // str.w r10, [r0, #32] // ............................................................................................................................................................................................................................................................................................................*. + // str.w r14, [r0], #4 // .......................................................................................................................................................................................................................................................................................................*...... + // vmov r14, s24 // ................................................................................................................................................................................................................................................................................*............................. + // cmp.w r0, r14 // ........................................................................................................................................................................................................................................................................................................*..... 
+ // bne.w layer1234_loop // .............................................................................................................................................................................................................................................................................................................* - // ---------------------------------------------------------------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 250 275 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- - // vmov r14, s8 // *.......................................................................................................................................................................................................................................................................................................... - // ldr.w r6, [r0, #352] // .*......................................................................................................................................................................................................................................................................................................... - // vmov s23, r0 // ...*....................................................................................................................................................................................................................................................................................................... 
- // ldr.w r7, [r0, #416] // ..........*................................................................................................................................................................................................................................................................................................ - // ldr.w r5, [r0, #96] // ..*........................................................................................................................................................................................................................................................................................................ - // smulwb r2, r14, r6 // .....*..................................................................................................................................................................................................................................................................................................... - // ldr.w r1, [r0, #160] // ............*.............................................................................................................................................................................................................................................................................................. - // smulwb r11, r14, r7 // ........................*.................................................................................................................................................................................................................................................................................. - // ldr.w r10, [r0, #288] // ....*...................................................................................................................................................................................................................................................................................................... 
- // smulwt r7, r14, r7 // ..........................*................................................................................................................................................................................................................................................................................ - // ldr.w r3, [r0, #224] // ..............*............................................................................................................................................................................................................................................................................................ - // smulwt r4, r14, r6 // .......*................................................................................................................................................................................................................................................................................................... - // ldr.w r8, [r0, #32] // ........*.................................................................................................................................................................................................................................................................................................. - // smulwt r6, r14, r10 // ...............*........................................................................................................................................................................................................................................................................................... - // ldr.w r9, [r0, #480] // ......*.................................................................................................................................................................................................................................................................................................... 
- // smulwb r10, r14, r10 // .........*................................................................................................................................................................................................................................................................................................. - // movw r0, #24608 // ................*.......................................................................................................................................................................................................................................................................................... - // smlabt r11, r11, r12, r0 // ................................*.......................................................................................................................................................................................................................................................................... - // smlabt r7, r7, r12, r0 // ..............................*............................................................................................................................................................................................................................................................................ - // smlabt r4, r4, r12, r0 // ............................*.............................................................................................................................................................................................................................................................................. - // pkhtb r11, r7, r11, asr #16 // .....................................*..................................................................................................................................................................................................................................................................... 
- // smlabt r7, r10, r12, r0 // ...................*....................................................................................................................................................................................................................................................................................... - // usub16 r10, r1, r11 // .........................................*................................................................................................................................................................................................................................................................. - // smlabt r6, r6, r12, r0 // ....................*...................................................................................................................................................................................................................................................................................... - // uadd16 r11, r1, r11 // .......................................*................................................................................................................................................................................................................................................................... - // smlabt r2, r2, r12, r0 // ......................*.................................................................................................................................................................................................................................................................................... - // pkhtb r7, r6, r7, asr #16 // .......................*................................................................................................................................................................................................................................................................................... 
- // smulwb r1, r14, r9 // ...........*............................................................................................................................................................................................................................................................................................... - // usub16 r6, r8, r7 // .............................................*............................................................................................................................................................................................................................................................. - // smulwt r14, r14, r9 // .............*............................................................................................................................................................................................................................................................................................. - // uadd16 r9, r8, r7 // ...........................................*............................................................................................................................................................................................................................................................... - // smlabt r8, r1, r12, r0 // ..................*........................................................................................................................................................................................................................................................................................ - // vmov r1, s9 // .............................*............................................................................................................................................................................................................................................................................. 
- // smlabt r14, r14, r12, r0 // .................*......................................................................................................................................................................................................................................................................................... - // pkhtb r4, r4, r2, asr #16 // ...............................*........................................................................................................................................................................................................................................................................... - // smulwb r7, r1, r11 // ............................................*.............................................................................................................................................................................................................................................................. - // pkhtb r14, r14, r8, asr #16 // .....................*..................................................................................................................................................................................................................................................................................... - // smulwt r2, r1, r11 // ..........................................*................................................................................................................................................................................................................................................................ - // usub16 r8, r3, r14 // .........................*................................................................................................................................................................................................................................................................................. 
- // smlabt r7, r7, r12, r0 // ................................................*.......................................................................................................................................................................................................................................................... - // uadd16 r14, r3, r14 // ...........................*............................................................................................................................................................................................................................................................................... - // smlabt r2, r2, r12, r0 // ..............................................*............................................................................................................................................................................................................................................................ - // smulwb r3, r1, r14 // ..................................*........................................................................................................................................................................................................................................................................ - // pkhtb r2, r2, r7, asr #16 // ....................................................*...................................................................................................................................................................................................................................................... - // smulwt r11, r1, r14 // ....................................*...................................................................................................................................................................................................................................................................... 
- // vmov r7, s10 // ...............................................*........................................................................................................................................................................................................................................................... - // smlabt r1, r3, r12, r0 // ......................................*.................................................................................................................................................................................................................................................................... - // smulwb r3, r7, r8 // .....................................................*..................................................................................................................................................................................................................................................... - // uadd16 r14, r5, r4 // ...................................*....................................................................................................................................................................................................................................................................... - // smulwt r8, r7, r8 // ......................................................*.................................................................................................................................................................................................................................................... - // usub16 r4, r5, r4 // .................................*......................................................................................................................................................................................................................................................................... 
- // smlabt r11, r11, r12, r0 // ........................................*.................................................................................................................................................................................................................................................................. - // smlabt r5, r3, r12, r0 // ........................................................*.................................................................................................................................................................................................................................................. - // pkhtb r11, r11, r1, asr #16 // .................................................*......................................................................................................................................................................................................................................................... - // smlabt r8, r8, r12, r0 // ..........................................................*................................................................................................................................................................................................................................................ - // usub16 r3, r14, r11 // .......................................................*................................................................................................................................................................................................................................................... - // smulwb r1, r7, r10 // ..................................................*........................................................................................................................................................................................................................................................ 
- // pkhtb r8, r8, r5, asr #16 // .............................................................*............................................................................................................................................................................................................................................. - // smulwt r7, r7, r10 // ...................................................*....................................................................................................................................................................................................................................................... - // vmov r10, s12 // .........................................................*................................................................................................................................................................................................................................................. - // smlabt r1, r1, r12, r0 // ....................................................................*...................................................................................................................................................................................................................................... - // uadd16 r5, r4, r8 // .................................................................*......................................................................................................................................................................................................................................... - // smlabt r7, r7, r12, r0 // ..........................................................................*................................................................................................................................................................................................................................ 
- // uadd16 r14, r14, r11 // ...........................................................*............................................................................................................................................................................................................................................... - // smulwt r11, r10, r3 // ..............................................................*............................................................................................................................................................................................................................................ - // pkhtb r7, r7, r1, asr #16 // .............................................................................*............................................................................................................................................................................................................................. - // smulwb r1, r10, r3 // ............................................................*.............................................................................................................................................................................................................................................. - // vmov r3, s13 // ...........................................................................*............................................................................................................................................................................................................................... - // smlabt r10, r11, r12, r0 // ..................................................................*........................................................................................................................................................................................................................................ 
- // usub16 r11, r4, r8 // ...............................................................*........................................................................................................................................................................................................................................... - // smlabt r4, r1, r12, r0 // ................................................................*.......................................................................................................................................................................................................................................... - // usub16 r1, r6, r7 // ...................................................................................*....................................................................................................................................................................................................................... - // smulwb r8, r3, r5 // ................................................................................*.......................................................................................................................................................................................................................... - // uadd16 r6, r6, r7 // ...............................................................................*........................................................................................................................................................................................................................... - // smulwt r5, r3, r5 // ..................................................................................*........................................................................................................................................................................................................................ 
- // uadd16 r3, r9, r2 // .......................................................................*................................................................................................................................................................................................................................... - // smlabt r8, r8, r12, r0 // ....................................................................................*...................................................................................................................................................................................................................... - // vmov r7, s11 // ...................................................................*....................................................................................................................................................................................................................................... - // smlabt r5, r5, r12, r0 // ......................................................................................*.................................................................................................................................................................................................................... - // pkhtb r10, r10, r4, asr #16 // .....................................................................*..................................................................................................................................................................................................................................... - // smulwb r4, r7, r14 // ........................................................................*.................................................................................................................................................................................................................................. 
- // pkhtb r8, r5, r8, asr #16 // .........................................................................................*................................................................................................................................................................................................................. - // smulwt r14, r7, r14 // ......................................................................*.................................................................................................................................................................................................................................... - // usub16 r5, r9, r2 // .........................................................................*................................................................................................................................................................................................................................. - // smlabt r4, r4, r12, r0 // ............................................................................*.............................................................................................................................................................................................................................. - // vmov r9, s14 // .....................................................................................*..................................................................................................................................................................................................................... - // smlabt r2, r14, r12, r0 // ..............................................................................*............................................................................................................................................................................................................................ 
- // usub16 r7, r6, r8 // .....................................................................................................*..................................................................................................................................................................................................... - // smulwb r14, r9, r11 // ........................................................................................*.................................................................................................................................................................................................................. - // pkhtb r4, r2, r4, asr #16 // .................................................................................*......................................................................................................................................................................................................................... - // vmov r2, s22 // .......................................................................................................................................*................................................................................................................................................................... - // smulwt r11, r9, r11 // ..........................................................................................*................................................................................................................................................................................................................ - // uadd16 r6, r6, r8 // .......................................................................................................*................................................................................................................................................................................................... 
- // smlabt r9, r14, r12, r0 // ............................................................................................*.............................................................................................................................................................................................................. - // vmov r14, s20 // .......................................................................................*................................................................................................................................................................................................................... - // smlabt r8, r11, r12, r0 // ..............................................................................................*............................................................................................................................................................................................................ - // smulwt r11, r14, r7 // ........................................................................................................*.................................................................................................................................................................................................. - // pkhtb r9, r8, r9, asr #16 // .................................................................................................*......................................................................................................................................................................................................... - // smulwb r7, r14, r7 // .........................................................................................................*................................................................................................................................................................................................. 
- // usub16 r8, r1, r9 // ..............................................................................................................*............................................................................................................................................................................................ - // smlabt r14, r11, r12, r0 // ...............................................................................................................*........................................................................................................................................................................................... - // uadd16 r1, r1, r9 // ............................................................................................................*.............................................................................................................................................................................................. - // smulwb r9, r2, r8 // ................................................................................................................................................*.......................................................................................................................................................... - // smulwt r11, r2, r8 // ..............................................................................................................................................*............................................................................................................................................................ - // uadd16 r8, r3, r4 // ..........................................................................................................................*................................................................................................................................................................................ 
- // smlabt r2, r9, r12, r0 // ....................................................................................................................................................*...................................................................................................................................................... - // vmov r9, s21 // ....................................................................................................................*...................................................................................................................................................................................... - // smlabt r11, r11, r12, r0 // ..................................................................................................................................................*........................................................................................................................................................ - // usub16 r4, r3, r4 // ............................................................................................................................*.............................................................................................................................................................................. - // smulwb r3, r9, r1 // .............................................................................................................................*............................................................................................................................................................................. - // pkhtb r2, r11, r2, asr #16 // .........................................................................................................................................................*................................................................................................................................................. 
- // smulwt r1, r9, r1 // ...............................................................................................................................*........................................................................................................................................................................... - // vmov s7, r2 // ...........................................................................................................................................................*............................................................................................................................................... - // smlabt r11, r3, r12, r0 // ................................................................................................................................*.......................................................................................................................................................................... - // vmov r2, s19 // ...................................................................................................*....................................................................................................................................................................................................... - // smlabt r1, r1, r12, r0 // ..................................................................................................................................*........................................................................................................................................................................ - // vmov r9, s16 // .....................................................................................................................................*..................................................................................................................................................................... 
- // smlabt r7, r7, r12, r0 // .............................................................................................................*............................................................................................................................................................................................. - // pkhtb r1, r1, r11, asr #16 // .........................................................................................................................................*................................................................................................................................................................. - // smulwt r11, r9, r4 // ......................................................................................................................................................*.................................................................................................................................................... - // vmov s6, r1 // ...............................................................................................................................................*........................................................................................................................................................... - // smulwt r1, r2, r6 // ...........................................................................................................*............................................................................................................................................................................................... - // pkhtb r3, r14, r7, asr #16 // ..................................................................................................................*........................................................................................................................................................................................ 
- // smlabt r7, r11, r12, r0 // ..........................................................................................................................................................*................................................................................................................................................ - // uadd16 r11, r5, r10 // ...........................................................................................*............................................................................................................................................................................................................... - // smulwb r14, r2, r6 // .................................................................................................................*......................................................................................................................................................................................... - // vmov r2, s17 // .............................................................................................*............................................................................................................................................................................................................. - // smlabt r1, r1, r12, r0 // .......................................................................................................................*................................................................................................................................................................................... - // smulwt r6, r2, r11 // ................................................................................................*.......................................................................................................................................................................................................... 
- // smlabt r14, r14, r12, r0 // .........................................................................................................................*................................................................................................................................................................................. - // vmov s5, r3 // ......................................................................................................................*.................................................................................................................................................................................... - // smlabt r6, r6, r12, r0 // ....................................................................................................*...................................................................................................................................................................................................... - // smulwb r11, r2, r11 // ..................................................................................................*........................................................................................................................................................................................................ - // pkhtb r14, r1, r14, asr #16 // ..............................................................................................................................*............................................................................................................................................................................ - // smulwb r1, r9, r4 // ........................................................................................................................................................*.................................................................................................................................................. 
- // vmov r9, s15 // .................................................................................................................................*......................................................................................................................................................................... - // smlabt r11, r11, r12, r0 // ......................................................................................................*.................................................................................................................................................................................................... - // vmov r4, s23 // .............................................................................................................................................*............................................................................................................................................................. - // smulwb r2, r9, r8 // ....................................................................................................................................*...................................................................................................................................................................... - // usub16 r5, r5, r10 // ...............................................................................................*........................................................................................................................................................................................................... - // smlabt r1, r1, r12, r0 // ..............................................................................................................................................................*............................................................................................................................................ 
- // vmov r3, s18 // ................................................................................................................*.......................................................................................................................................................................................... - // smlabt r10, r2, r12, r0 // ............................................................................................................................................*.............................................................................................................................................................. - // pkhtb r11, r6, r11, asr #16 // ..........................................................................................................*................................................................................................................................................................................................ - // smulwt r2, r3, r5 // ...................................................................................................................*....................................................................................................................................................................................... - // vmov s2, r11 // ........................................................................................................................*.................................................................................................................................................................................. - // smulwt r9, r9, r8 // ......................................................................................................................................*.................................................................................................................................................................... 
- // pkhtb r11, r7, r1, asr #16 // ...................................................................................................................................................................*....................................................................................................................................... - // smlabt r1, r2, r12, r0 // ........................................................................................................................................*.................................................................................................................................................................. - // vmov r2, s8 // ...................................................................................................................................................*....................................................................................................................................................... - // smulwb r3, r3, r5 // .....................................................................................................................*..................................................................................................................................................................................... - // ldr.w r7, [r4, #320] // .....................................................................................................................................................*..................................................................................................................................................... - // smlabt r5, r9, r12, r0 // ..........................................................................................................................................*................................................................................................................................................................ 
- // ldr.w r8, [r4, #0] // .............................................................................................................................................................*............................................................................................................................................. - // smlabt r6, r3, r12, r0 // ...........................................................................................................................*............................................................................................................................................................................... - // pkhtb r10, r5, r10, asr #16 // .................................................................................................................................................*......................................................................................................................................................... - // smulwb r5, r2, r7 // ................................................................................................................................................................*.......................................................................................................................................... - // ldr.w r3, [r4, #256] // .......................................................................................................................................................*................................................................................................................................................... - // smulwt r9, r2, r7 // ....................................................................................................................................................................*...................................................................................................................................... 
- // movw r0, #24608 // .....................................................................................................................................................................*..................................................................................................................................... - // smlabt r7, r5, r12, r0 // ..........................................................................................................................................................................*................................................................................................................................ - // vmov s4, r14 // ...................................................................................................................................*....................................................................................................................................................................... - // smulwb r5, r2, r3 // ............................................................................................................................................................*.............................................................................................................................................. - // pkhtb r14, r1, r6, asr #16 // ...........................................................................................................................................*............................................................................................................................................................... - // smulwt r3, r2, r3 // ..................................................................................................................................................................*........................................................................................................................................ 
- // ldr.w r6, [r4, #384] // .......................................................................................................................................................................*................................................................................................................................... - // smlabt r1, r5, r12, r0 // ......................................................................................................................................................................*.................................................................................................................................... - // ldr.w r5, [r4, #64] // .....................................................................................................................................................................................*..................................................................................................................... - // smlabt r3, r3, r12, r0 // ........................................................................................................................................................................*.................................................................................................................................. - // vmov s3, r14 // ...............................................................................................................................................................*........................................................................................................................................... - // smulwb r14, r2, r6 // ............................................................................................................................................................................*.............................................................................................................................. 
- // pkhtb r1, r3, r1, asr #16 // ...........................................................................................................................................................................*............................................................................................................................... - // smulwt r3, r2, r6 // ..................................................................................................................................................................................*........................................................................................................................ - // usub16 r6, r8, r1 // .............................................................................................................................................................................*............................................................................................................................. - // smlabt r9, r9, r12, r0 // ................................................................................................................................................................................*.......................................................................................................................... - // uadd16 r1, r8, r1 // .................................................................................................................................................................................*......................................................................................................................... - // smlabt r8, r3, r12, r0 // ........................................................................................................................................................................................*.................................................................................................................. 
- // pkhtb r3, r9, r7, asr #16 // ...................................................................................................................................................................................*....................................................................................................................... - // smlabt r9, r14, r12, r0 // ......................................................................................................................................................................................*.................................................................................................................... - // vmov s0, r10 // .................................................................................................................................................................*......................................................................................................................................... - // ldr.w r10, [r4, #448] // .........................................................................................................................................................................*................................................................................................................................. - // ldr.w r7, [r4, #128] // .......................................................................................................................................................................................*................................................................................................................... - // vmov s1, r11 // ...............................................................................................................................................................................*........................................................................................................................... 
- // pkhtb r8, r8, r9, asr #16 // ...........................................................................................................................................................................................*............................................................................................................... - // smulwb r14, r2, r10 // ..............................................................................................................................................................................*............................................................................................................................ - // uadd16 r11, r7, r8 // .............................................................................................................................................................................................*............................................................................................................. - // smulwt r10, r2, r10 // ....................................................................................................................................................................................*...................................................................................................................... - // ldr.w r9, [r4, #192] // .........................................................................................................................................................................................*................................................................................................................. - // smlabt r4, r14, r12, r0 // ..........................................................................................................................................................................................*................................................................................................................ 
- // vmov r2, s9 // ............................................................................................................................................................................................*.............................................................................................................. - // smlabt r14, r10, r12, r0 // ..............................................................................................................................................................................................*............................................................................................................ - // usub16 r10, r7, r8 // ...............................................................................................................................................................................................*........................................................................................................... - // smulwb r7, r2, r11 // ................................................................................................................................................................................................*.......................................................................................................... - // pkhtb r8, r14, r4, asr #16 // .................................................................................................................................................................................................*......................................................................................................... - // smulwt r14, r2, r11 // ..................................................................................................................................................................................................*........................................................................................................ 
- // usub16 r4, r9, r8 // ...................................................................................................................................................................................................*....................................................................................................... - // smlabt r11, r7, r12, r0 // ....................................................................................................................................................................................................*...................................................................................................... - // uadd16 r7, r9, r8 // .....................................................................................................................................................................................................*..................................................................................................... - // smlabt r14, r14, r12, r0 // ......................................................................................................................................................................................................*.................................................................................................... - // vmov r8, s10 // .......................................................................................................................................................................................................*................................................................................................... - // smulwb r9, r2, r7 // ........................................................................................................................................................................................................*.................................................................................................. 
- // pkhtb r11, r14, r11, asr #16 // .........................................................................................................................................................................................................*................................................................................................. - // smulwt r2, r2, r7 // ..........................................................................................................................................................................................................*................................................................................................ - // usub16 r14, r1, r11 // ...........................................................................................................................................................................................................*............................................................................................... - // smlabt r7, r9, r12, r0 // ............................................................................................................................................................................................................*.............................................................................................. - // uadd16 r1, r1, r11 // .............................................................................................................................................................................................................*............................................................................................. - // smlabt r11, r2, r12, r0 // ..............................................................................................................................................................................................................*............................................................................................ 
- // uadd16 r9, r5, r3 // .................................................................................................................................................................................................................*......................................................................................... - // smulwb r2, r8, r10 // ..................................................................................................................................................................................................................*........................................................................................ - // usub16 r3, r5, r3 // ...............................................................................................................................................................................................................*........................................................................................... - // smulwt r5, r8, r4 // ................................................................................................................................................................................................................*.......................................................................................... - // pkhtb r7, r11, r7, asr #16 // ...................................................................................................................................................................................................................*....................................................................................... - // smulwt r11, r8, r10 // ....................................................................................................................................................................................................................*...................................................................................... 
- // usub16 r10, r9, r7 // .....................................................................................................................................................................................................................*..................................................................................... - // smlabt r2, r2, r12, r0 // ......................................................................................................................................................................................................................*.................................................................................... - // uadd16 r7, r9, r7 // .......................................................................................................................................................................................................................*................................................................................... - // smlabt r9, r11, r12, r0 // ........................................................................................................................................................................................................................*.................................................................................. - // vmov r11, s12 // .........................................................................................................................................................................................................................*................................................................................. - // smulwb r4, r8, r4 // ..........................................................................................................................................................................................................................*................................................................................ 
- // pkhtb r9, r9, r2, asr #16 // ...........................................................................................................................................................................................................................*............................................................................... - // smulwb r2, r11, r10 // ............................................................................................................................................................................................................................*.............................................................................. - // usub16 r8, r6, r9 // .............................................................................................................................................................................................................................*............................................................................. - // smlabt r4, r4, r12, r0 // ..............................................................................................................................................................................................................................*............................................................................ - // uadd16 r9, r6, r9 // ...............................................................................................................................................................................................................................*........................................................................... - // vmov r6, s13 // .................................................................................................................................................................................................................................*......................................................................... 
- // smlabt r5, r5, r12, r0 // ................................................................................................................................................................................................................................*.......................................................................... - // smulwt r11, r11, r10 // ..................................................................................................................................................................................................................................*........................................................................ - // pkhtb r4, r5, r4, asr #16 // ...................................................................................................................................................................................................................................*....................................................................... - // smlabt r5, r2, r12, r0 // ....................................................................................................................................................................................................................................*...................................................................... - // uadd16 r2, r3, r4 // .....................................................................................................................................................................................................................................*..................................................................... - // smlabt r11, r11, r12, r0 // ......................................................................................................................................................................................................................................*.................................................................... 
- // usub16 r4, r3, r4 // .......................................................................................................................................................................................................................................*................................................................... - // smulwb r10, r6, r2 // ........................................................................................................................................................................................................................................*.................................................................. - // vmov r3, s11 // .............................................................................................................................................................................................................................................*............................................................. - // smulwt r2, r6, r2 // ..........................................................................................................................................................................................................................................*................................................................ - // pkhtb r5, r11, r5, asr #16 // .........................................................................................................................................................................................................................................*................................................................. - // smlabt r11, r10, r12, r0 // ............................................................................................................................................................................................................................................*.............................................................. 
- // uadd16 r10, r14, r5 // ...........................................................................................................................................................................................................................................*............................................................... - // smlabt r2, r2, r12, r0 // ..............................................................................................................................................................................................................................................*............................................................ - // usub16 r14, r14, r5 // ...............................................................................................................................................................................................................................................*........................................................... - // smulwb r5, r3, r7 // ................................................................................................................................................................................................................................................*.......................................................... - // pkhtb r11, r2, r11, asr #16 // .................................................................................................................................................................................................................................................*......................................................... - // smulwt r7, r3, r7 // ..................................................................................................................................................................................................................................................*........................................................ 
- // usub16 r2, r9, r11 // ...................................................................................................................................................................................................................................................*....................................................... - // smlabt r3, r5, r12, r0 // ....................................................................................................................................................................................................................................................*...................................................... - // vmov r5, s14 // .....................................................................................................................................................................................................................................................*..................................................... - // smlabt r7, r7, r12, r0 // ......................................................................................................................................................................................................................................................*.................................................... - // uadd16 r9, r9, r11 // .......................................................................................................................................................................................................................................................*................................................... - // smulwb r11, r5, r4 // ..........................................................................................................................................................................................................................................................*................................................ 
- // pkhtb r3, r7, r3, asr #16 // .........................................................................................................................................................................................................................................................*................................................. - // smulwt r7, r5, r4 // ........................................................................................................................................................................................................................................................*.................................................. - // vmov r6, s1 // .............................................................................................................................................................................................................................................................*............................................. - // smlabt r5, r11, r12, r0 // ..............................................................................................................................................................................................................................................................*............................................ - // usub16 r4, r1, r3 // ...........................................................................................................................................................................................................................................................*............................................... - // smlabt r7, r7, r12, r0 // ............................................................................................................................................................................................................................................................*.............................................. 
- // usub16 r11, r4, r6 // ...............................................................................................................................................................................................................................................................*........................................... - // vmov r0, s23 // ................................................................................................................................................................................................................................................................*.......................................... - // uadd16 r6, r4, r6 // .....................................................................................................................................................................................................................................................................*..................................... - // str.w r11, [r0, #96] // .................................................................................................................................................................................................................................................................*......................................... - // str.w r6, [r0, #64] // ......................................................................................................................................................................................................................................................................*.................................... - // vmov r4, s2 // ..................................................................................................................................................................................................................................................................*........................................ 
- // usub16 r11, r10, r4 // .......................................................................................................................................................................................................................................................................*................................... - // str.w r11, [r0, #160] // ........................................................................................................................................................................................................................................................................*.................................. - // pkhtb r6, r7, r5, asr #16 // ...................................................................................................................................................................................................................................................................*....................................... - // vmov r11, s5 // ....................................................................................................................................................................................................................................................................*...................................... - // uadd16 r5, r2, r11 // .........................................................................................................................................................................................................................................................................*................................. - // str.w r5, [r0, #320] // ..........................................................................................................................................................................................................................................................................*................................ 
- // usub16 r7, r2, r11 // ...........................................................................................................................................................................................................................................................................*............................... - // str.w r7, [r0, #352] // ..............................................................................................................................................................................................................................................................................*............................ - // usub16 r5, r8, r6 // .............................................................................................................................................................................................................................................................................*............................. - // vmov r7, s7 // ............................................................................................................................................................................................................................................................................*.............................. - // usub16 r11, r5, r7 // ...............................................................................................................................................................................................................................................................................*........................... - // vmov r2, s0 // ....................................................................................................................................................................................................................................................................................*...................... 
- // uadd16 r7, r5, r7 // .................................................................................................................................................................................................................................................................................*......................... - // str.w r11, [r0, #480] // ................................................................................................................................................................................................................................................................................*.......................... - // uadd16 r1, r1, r3 // ...................................................................................................................................................................................................................................................................................*....................... - // vmov r3, s3 // ..................................................................................................................................................................................................................................................................................*........................ - // uadd16 r10, r10, r4 // .......................................................................................................................................................................................................................................................................................*................... - // str.w r10, [r0, #128] // ........................................................................................................................................................................................................................................................................................*.................. 
- // uadd16 r5, r14, r3 // .........................................................................................................................................................................................................................................................................................*................. - // str.w r5, [r0, #192] // ..........................................................................................................................................................................................................................................................................................*................ - // usub16 r10, r1, r2 // ...........................................................................................................................................................................................................................................................................................*............... - // str.w r10, [r0, #32] // ............................................................................................................................................................................................................................................................................................*.............. - // usub16 r3, r14, r3 // .....................................................................................................................................................................................................................................................................................*..................... - // str.w r3, [r0, #224] // ......................................................................................................................................................................................................................................................................................*.................... 
- // uadd16 r14, r1, r2 // .............................................................................................................................................................................................................................................................................................*............. - // vmov r1, s6 // ................................................................................................................................................................................................................................................................................................*.......... - // uadd16 r3, r8, r6 // ...............................................................................................................................................................................................................................................................................................*........... - // str.w r14, [r0], #4 // ..............................................................................................................................................................................................................................................................................................*............ - // usub16 r2, r3, r1 // .................................................................................................................................................................................................................................................................................................*......... - // str.w r2, [r0, #412] // ..................................................................................................................................................................................................................................................................................................*........ 
- // uadd16 r1, r3, r1 // ...................................................................................................................................................................................................................................................................................................*....... - // str.w r1, [r0, #380] // .....................................................................................................................................................................................................................................................................................................*..... - // str.w r7, [r0, #444] // ....................................................................................................................................................................................................................................................................................................*...... - // vmov r7, s4 // ......................................................................................................................................................................................................................................................................................................*.... - // usub16 r11, r9, r7 // .......................................................................................................................................................................................................................................................................................................*... - // str.w r11, [r0, #284] // ........................................................................................................................................................................................................................................................................................................*.. 
- // uadd16 r7, r9, r7 // .........................................................................................................................................................................................................................................................................................................*. - // str.w r7, [r0, #252] // ..........................................................................................................................................................................................................................................................................................................* - vmov tmp, s24 - cmp poly, tmp - bne layer1234_loop sub.w poly, #8*strincr @@ -778,249 +1098,246 @@ layer1234_loop: add.w tmp, poly, #strincr2*16 vmov s13, tmp - vmov twiddle_ptr, s25 layer567_loop: // Instructions: 110 - // Expected cycles: 60 - // Expected IPC: 1.83 + // Expected cycles: 59 + // Expected IPC: 1.86 // // --------------------------------------------- original position ---------------------------------------------> // 0 25 50 75 100 // |------------------------|------------------------|------------------------|------------------------|--------- - ldr.w r6, [r0, #24] // *............................................................................................................. - ldr.w r2, [r1], #28 // .....*........................................................................................................ - ldr.w r3, [r0, #16] // ..........*................................................................................................... - ldr.w r8, [r0, #20] // ......*....................................................................................................... - ldr.w r11, [r0, #28] // ........*..................................................................................................... 
- smulwt r10, r2, r6 // ...........*.................................................................................................. - ldr.w r5, [r0, #4] // ...*.......................................................................................................... - smulwb r14, r2, r8 // ......................*....................................................................................... - ldr.w r4, [r0, #0] // ....*......................................................................................................... - smulwt r7, r2, r8 // ........................*..................................................................................... - ldr.w r8, [r0, #12] // .*............................................................................................................ - smulwb r6, r2, r6 // .........*.................................................................................................... - ldr.w r9, [r0, #8] // ..*........................................................................................................... - vmov s23, r0 // .......*...................................................................................................... - movw r0, #24608 // ............*................................................................................................. - smlabt r14, r14, r12, r0 // ..........................*................................................................................... + vmov s23, r0 // .*............................................................................................................ + ldr.w r9, [r1], #28 // *............................................................................................................. + ldr.w r14, [r0, #24] // ..*........................................................................................................... 
+ ldr.w r2, [r0, #20] // ...*.......................................................................................................... + ldr.w r10, [r0, #12] // ....*......................................................................................................... + ldr.w r8, [r0, #16] // .....*........................................................................................................ + ldr.w r6, [r0, #28] // ......*....................................................................................................... + smulwb r11, r9, r2 // .........*.................................................................................................... + ldr.w r4, [r0, #4] // ........*..................................................................................................... + smulwt r3, r9, r2 // .......*...................................................................................................... + ldr.w r2, [r0, #8] // ..........*................................................................................................... + smulwb r7, r9, r6 // ...........*.................................................................................................. + ldr.w r5, [r0, #0] // ............*................................................................................................. + smulwt r6, r9, r6 // .............*................................................................................................ + movw r0, #24608 // ..............*............................................................................................... + smlabt r7, r7, r12, r0 // ...............*.............................................................................................. // gap // .............................................................................................................. 
- smlabt r6, r6, r12, r0 // .............*................................................................................................ + smlabt r11, r11, r12, r0 // ................*............................................................................................. // gap // .............................................................................................................. - smlabt r10, r10, r12, r0 // ..............*............................................................................................... + smlabt r6, r6, r12, r0 // .................*............................................................................................ // gap // .............................................................................................................. - smlabt r7, r7, r12, r0 // ............................*................................................................................. - pkhtb r10, r10, r6, asr #16 // ................*............................................................................................. - smulwb r6, r2, r3 // .............................*................................................................................ - pkhtb r14, r7, r14, asr #16 // ..............................*............................................................................... - smulwt r3, r2, r3 // ...............................*.............................................................................. - usub16 r7, r5, r14 // ..................................*........................................................................... - smlabt r6, r6, r12, r0 // .................................*............................................................................ - uadd16 r14, r5, r14 // ....................................*......................................................................... 
- smlabt r3, r3, r12, r0 // ...................................*.......................................................................... + smlabt r3, r3, r12, r0 // ..................*........................................................................................... + pkhtb r6, r6, r7, asr #16 // ...................*.......................................................................................... + smulwb r7, r9, r14 // ....................*......................................................................................... // gap // .............................................................................................................. - smulwb r5, r2, r11 // ...............*.............................................................................................. - pkhtb r3, r3, r6, asr #16 // ......................................*....................................................................... - smulwt r11, r2, r11 // .................*............................................................................................ - ldr r2, [r1, #-20] // ................................*............................................................................. - smlabt r5, r5, r12, r0 // ...................*.......................................................................................... - usub16 r6, r9, r10 // ..................*........................................................................................... - smlabt r11, r11, r12, r0 // .....................*........................................................................................ - uadd16 r10, r9, r10 // ....................*......................................................................................... - smulwb r9, r2, r6 // .....................................*........................................................................ 
- pkhtb r11, r11, r5, asr #16 // .......................*...................................................................................... - smulwt r6, r2, r6 // .......................................*...................................................................... - uadd16 r5, r8, r11 // ...........................*.................................................................................. - smlabt r9, r9, r12, r0 // .........................................*.................................................................... - usub16 r11, r8, r11 // .........................*.................................................................................... - smlabt r6, r6, r12, r0 // ...........................................*.................................................................. - usub16 r8, r4, r3 // ..........................................*................................................................... + smulwt r14, r9, r14 // ......................*....................................................................................... + pkhtb r3, r3, r11, asr #16 // .....................*........................................................................................ + smlabt r11, r7, r12, r0 // .......................*...................................................................................... + usub16 r7, r4, r3 // ..........................*................................................................................... + smlabt r14, r14, r12, r0 // .........................*.................................................................................... + uadd16 r3, r4, r3 // ........................*..................................................................................... + smulwb r4, r9, r8 // ...........................*.................................................................................. 
+ pkhtb r11, r14, r11, asr #16 // ............................*................................................................................. + smulwt r8, r9, r8 // .............................*................................................................................ + ldr r9, [r1, #-20] // ...............................................*.............................................................. + smlabt r4, r4, r12, r0 // ...............................*.............................................................................. + usub16 r14, r2, r11 // .....................................*........................................................................ + smlabt r8, r8, r12, r0 // .................................*............................................................................ + uadd16 r2, r2, r11 // .......................................*...................................................................... + smulwb r11, r9, r14 // ....................................................*......................................................... + pkhtb r8, r8, r4, asr #16 // ...................................*.......................................................................... + smulwt r14, r9, r14 // .....................................................*........................................................ + uadd16 r4, r10, r6 // ..............................*............................................................................... + smlabt r11, r11, r12, r0 // ........................................................*..................................................... + usub16 r6, r10, r6 // ..................................*........................................................................... + smlabt r10, r14, r12, r0 // .........................................................*.................................................... 
// gap // .............................................................................................................. - pkhtb r9, r6, r9, asr #16 // .............................................*................................................................ - smulwb r6, r2, r11 // ..............................................*............................................................... - uadd16 r3, r4, r3 // ........................................*..................................................................... - smulwt r2, r2, r11 // ............................................*................................................................. - ldr r11, [r1, #-24] // ...............................................*.............................................................. - smlabt r4, r6, r12, r0 // ..................................................*........................................................... - uadd16 r6, r8, r9 // ...................................................*.......................................................... - smlabt r2, r2, r12, r0 // ................................................*............................................................. - usub16 r8, r8, r9 // .................................................*............................................................ - smulwb r9, r11, r5 // ....................................................*......................................................... - pkhtb r2, r2, r4, asr #16 // ......................................................*....................................................... - smulwt r4, r11, r5 // .....................................................*........................................................ - usub16 r5, r7, r2 // .......................................................*...................................................... 
- smlabt r9, r9, r12, r0 // ........................................................*..................................................... - uadd16 r7, r7, r2 // .........................................................*.................................................... - smlabt r2, r4, r12, r0 // ..........................................................*................................................... + smulwb r14, r9, r6 // ...........................................................*.................................................. + pkhtb r10, r10, r11, asr #16 // ............................................................*................................................. + smulwt r9, r9, r6 // .............................................................*................................................ + ldr r6, [r1, #-24] // ................................*............................................................................. + smlabt r14, r14, r12, r0 // ...............................................................*.............................................. + uadd16 r11, r5, r8 // .........................................*.................................................................... + smlabt r9, r9, r12, r0 // .................................................................*............................................ + usub16 r5, r5, r8 // ...........................................*.................................................................. + smulwb r8, r6, r4 // ....................................*......................................................................... + pkhtb r14, r9, r14, asr #16 // ....................................................................*......................................... + smulwt r4, r6, r4 // ......................................*....................................................................... 
+ usub16 r9, r7, r14 // ........................................................................*..................................... + smlabt r8, r8, r12, r0 // ........................................*..................................................................... + uadd16 r7, r7, r14 // ..........................................................................*................................... + smlabt r14, r4, r12, r0 // ..........................................*................................................................... // gap // .............................................................................................................. - smulwb r4, r11, r10 // ...........................................................*.................................................. - pkhtb r9, r2, r9, asr #16 // ............................................................*................................................. - smulwt r11, r11, r10 // .............................................................*................................................ - ldr r2, [r1, #-16] // ......................................................................................*....................... - smlabt r10, r4, r12, r0 // ...............................................................*.............................................. - uadd16 r4, r14, r9 // ..................................................................*........................................... - smlabt r11, r11, r12, r0 // .................................................................*............................................ - usub16 r9, r14, r9 // ................................................................*............................................. - smulwb r14, r2, r4 // ...........................................................................................*.................. 
- pkhtb r11, r11, r10, asr #16 // ....................................................................*......................................... - smulwt r2, r2, r4 // ............................................................................................*................. - ldr r4, [r1, #-12] // ..............................................................*............................................... - smlabt r14, r14, r12, r0 // ..............................................................................................*............... - uadd16 r10, r3, r11 // ........................................................................*..................................... - smlabt r2, r2, r12, r0 // ................................................................................................*............. - usub16 r3, r3, r11 // ..........................................................................*................................... - smulwb r11, r4, r9 // ...................................................................*.......................................... - pkhtb r2, r2, r14, asr #16 // .....................................................................................................*........ - smulwt r4, r4, r9 // .....................................................................*........................................ - ldr r14, [r1, #-4] // ..............................................................................*............................... - smlabt r9, r11, r12, r0 // .......................................................................*...................................... + smulwb r4, r6, r2 // ..............................................*............................................................... + pkhtb r8, r14, r8, asr #16 // .............................................*................................................................ 
+ smulwt r14, r6, r2 // ............................................*................................................................. + ldr r6, [r1, #-12] // ......................................................................*....................................... + smlabt r4, r4, r12, r0 // ..................................................*........................................................... + usub16 r2, r3, r8 // .................................................*............................................................ + smlabt r14, r14, r12, r0 // ................................................*............................................................. + uadd16 r8, r3, r8 // ...................................................*.......................................................... + smulwb r3, r6, r2 // ...........................................................................*.................................. + pkhtb r4, r14, r4, asr #16 // ......................................................*....................................................... + smulwt r2, r6, r2 // .............................................................................*................................ + ldr r14, [r1, #-16] // ..............................................................*............................................... + smlabt r6, r3, r12, r0 // ...............................................................................*.............................. + usub16 r3, r5, r10 // ..................................................................*........................................... + smlabt r2, r2, r12, r0 // .................................................................................*............................ + uadd16 r5, r5, r10 // ................................................................*............................................. 
+ smulwb r10, r14, r8 // ...................................................................*.......................................... + pkhtb r2, r2, r6, asr #16 // ....................................................................................*......................... + smulwt r8, r14, r8 // .....................................................................*........................................ + ldr r6, [r1, #-4] // ......................................................................................*....................... + smlabt r10, r10, r12, r0 // .......................................................................*...................................... + uadd16 r14, r11, r4 // ..........................................................*................................................... + smlabt r8, r8, r12, r0 // .........................................................................*.................................... + usub16 r11, r11, r4 // .......................................................*...................................................... + smulwb r4, r6, r9 // ...........................................................................................*.................. + pkhtb r10, r8, r10, asr #16 // ............................................................................*................................. + smulwt r9, r6, r9 // .............................................................................................*................ + ldr r8, [r1, #-8] // ..............................................................................*............................... + smlabt r4, r4, r12, r0 // ...............................................................................................*.............. + uadd16 r6, r14, r10 // ..................................................................................*........................... 
+ smlabt r9, r9, r12, r0 // .................................................................................................*............ + usub16 r14, r14, r10 // ................................................................................*............................. + smulwb r10, r8, r7 // ...................................................................................*.......................... // gap // .............................................................................................................. - smlabt r11, r4, r12, r0 // .........................................................................*.................................... + smulwt r7, r8, r7 // .....................................................................................*........................ + uadd16 r8, r11, r2 // ..........................................................................................*................... + smlabt r10, r10, r12, r0 // .......................................................................................*...................... + usub16 r2, r11, r2 // ........................................................................................*..................... + smlabt r7, r7, r12, r0 // .........................................................................................*.................... + vmov r0, s23 // ..................................................................................................*........... + str.w r6, [r0], #32 // .....................................................................................................*........ // @slothy:core // @slothy:before=cmp + str r14, [r0, #-28] // ...................................................................................................*.......... // gap // .............................................................................................................. 
- smulwb r4, r14, r5 // ...................................................................................*.......................... - pkhtb r11, r11, r9, asr #16 // ............................................................................*................................. - smulwt r14, r14, r5 // .....................................................................................*........................ - ldr r9, [r1, #-8] // ......................................................................*....................................... - smlabt r4, r4, r12, r0 // .......................................................................................*...................... - usub16 r5, r3, r11 // ..................................................................................*........................... - smlabt r14, r14, r12, r0 // .........................................................................................*.................... - uadd16 r11, r3, r11 // ................................................................................*............................. - smulwt r3, r9, r7 // .............................................................................*................................ - pkhtb r14, r14, r4, asr #16 // .............................................................................................*................ - smulwb r4, r9, r7 // ...........................................................................*.................................. - uadd16 r9, r8, r14 // ...................................................................................................*.......... - smlabt r7, r3, r12, r0 // .................................................................................*............................ - usub16 r3, r10, r2 // .........................................................................................................*.... 
- smlabt r4, r4, r12, r0 // ...............................................................................*.............................. - vmov r0, s23 // .................................................................................................*............ - str.w r3, [r0, #4] // ............................................................................................................*. - str.w r11, [r0, #8] // ..................................................................................................*........... - // gap // .............................................................................................................. - str.w r5, [r0, #12] // ....................................................................................................*......... - // gap // .............................................................................................................. - pkhtb r4, r7, r4, asr #16 // ....................................................................................*......................... - str.w r9, [r0, #24] // ..........................................................................................................*... - usub16 r7, r6, r4 // ........................................................................................*..................... - str.w r7, [r0, #20] // ......................................................................................................*....... - uadd16 r9, r10, r2 // .......................................................................................................*...... - str.w r9, [r0], #32 // .............................................................................................................* // @slothy:core // @slothy:before=cmp - uadd16 r6, r6, r4 // ..........................................................................................*................... 
- str r6, [r0, #-16] // ........................................................................................................*..... - usub16 r6, r8, r14 // ...............................................................................................*.............. - str r6, [r0, #-4] // ...........................................................................................................*.. + pkhtb r7, r7, r10, asr #16 // ............................................................................................*................. + str r2, [r0, #-20] // .......................................................................................................*...... + usub16 r6, r5, r7 // ..............................................................................................*............... + str r8, [r0, #-24] // .........................................................................................................*.... + uadd16 r14, r5, r7 // ................................................................................................*............. + str r14, [r0, #-16] // ...........................................................................................................*.. + pkhtb r14, r9, r4, asr #16 // ....................................................................................................*......... + str r6, [r0, #-12] // ..........................................................................................................*... + uadd16 r8, r3, r14 // ........................................................................................................*..... + str r8, [r0, #-8] // ............................................................................................................*. + usub16 r8, r3, r14 // ......................................................................................................*....... 
+ str r8, [r0, #-4] // .............................................................................................................* - // ----------------------------------------------- new position ------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|--------- - // ldr.w r3, [r0, #24] // *............................................................................................................. - // ldr.w r5, [r0, #12] // ..........*................................................................................................... - // ldr.w r6, [r0, #8] // ............*................................................................................................. - // ldr.w r14, [r0, #4] // ......*....................................................................................................... - // ldr.w r11, [r0, #0] // ........*..................................................................................................... - // ldr.w r10, [r1], #28 // .*............................................................................................................ - // ldr.w r9, [r0, #20] // ...*.......................................................................................................... - // vmov s23, r0 // .............*................................................................................................ - // ldr.w r8, [r0, #28] // ....*......................................................................................................... - // smulwb r2, r10, r3 // ...........*.................................................................................................. - // ldr.w r7, [r0, #16] // ..*........................................................................................................... 
- // smulwt r3, r10, r3 // .....*........................................................................................................ - // movw r0, #24608 // ..............*............................................................................................... - // smlabt r2, r2, r12, r0 // ................*............................................................................................. - // smlabt r3, r3, r12, r0 // .................*............................................................................................ - // smulwb r4, r10, r8 // ...........................*.................................................................................. - // pkhtb r2, r3, r2, asr #16 // ...................*.......................................................................................... - // smulwt r3, r10, r8 // .............................*................................................................................ - // usub16 r8, r6, r2 // ................................*............................................................................. - // smlabt r4, r4, r12, r0 // ...............................*.............................................................................. - // uadd16 r2, r6, r2 // ..................................*........................................................................... - // smlabt r3, r3, r12, r0 // .................................*............................................................................ - // smulwb r6, r10, r9 // .......*...................................................................................................... - // pkhtb r4, r3, r4, asr #16 // ....................................*......................................................................... - // smulwt r3, r10, r9 // .........*.................................................................................................... 
- // usub16 r9, r5, r4 // ........................................*..................................................................... - // smlabt r6, r6, r12, r0 // ...............*.............................................................................................. - // uadd16 r5, r5, r4 // ......................................*....................................................................... - // smlabt r4, r3, r12, r0 // ..................*........................................................................................... - // smulwb r3, r10, r7 // ....................*......................................................................................... - // pkhtb r6, r4, r6, asr #16 // .....................*........................................................................................ - // smulwt r10, r10, r7 // ......................*....................................................................................... - // ldr r4, [r1, #-20] // ..............................*............................................................................... - // smlabt r3, r3, r12, r0 // ........................*..................................................................................... - // usub16 r7, r14, r6 // .......................*...................................................................................... - // smlabt r10, r10, r12, r0 // ..........................*................................................................................... - // uadd16 r14, r14, r6 // .........................*.................................................................................... - // smulwb r6, r4, r8 // ...................................*.......................................................................... - // pkhtb r10, r10, r3, asr #16 // ............................*................................................................................. 
- // smulwt r8, r4, r8 // .....................................*........................................................................ - // uadd16 r3, r11, r10 // .............................................*................................................................ - // smlabt r6, r6, r12, r0 // .......................................*...................................................................... - // usub16 r10, r11, r10 // ..........................................*................................................................... - // smlabt r11, r8, r12, r0 // .........................................*.................................................................... - // smulwt r8, r4, r9 // ..............................................*............................................................... - // pkhtb r6, r11, r6, asr #16 // ...........................................*.................................................................. - // smulwb r9, r4, r9 // ............................................*................................................................. - // ldr r11, [r1, #-24] // ...............................................*.............................................................. - // smlabt r4, r8, r12, r0 // ..................................................*........................................................... - // usub16 r8, r10, r6 // ...................................................*.......................................................... - // smlabt r9, r9, r12, r0 // ................................................*............................................................. - // uadd16 r6, r10, r6 // .................................................*............................................................ - // smulwb r10, r11, r5 // ....................................................*......................................................... 
- // smulwt r5, r11, r5 // ......................................................*....................................................... - // pkhtb r4, r4, r9, asr #16 // .....................................................*........................................................ - // usub16 r9, r7, r4 // .......................................................*...................................................... - // smlabt r10, r10, r12, r0 // ........................................................*..................................................... - // uadd16 r7, r7, r4 // .........................................................*.................................................... - // smlabt r4, r5, r12, r0 // ..........................................................*................................................... - // smulwb r5, r11, r2 // ...........................................................*.................................................. - // pkhtb r4, r4, r10, asr #16 // ............................................................*................................................. - // smulwt r10, r11, r2 // .............................................................*................................................ - // ldr r2, [r1, #-12] // ......................................................................*....................................... - // smlabt r5, r5, r12, r0 // ...............................................................*.............................................. - // usub16 r11, r14, r4 // ..................................................................*........................................... - // smlabt r10, r10, r12, r0 // .................................................................*............................................ - // uadd16 r14, r14, r4 // ................................................................*............................................. 
- // smulwb r4, r2, r11 // ...........................................................................*.................................. - // pkhtb r5, r10, r5, asr #16 // ....................................................................*......................................... - // smulwt r10, r2, r11 // .............................................................................*................................ - // ldr r2, [r1, #-8] // ....................................................................................*......................... - // smlabt r4, r4, r12, r0 // ...............................................................................*.............................. - // uadd16 r11, r3, r5 // ........................................................................*..................................... - // smlabt r10, r10, r12, r0 // ................................................................................*............................. - // usub16 r5, r3, r5 // ..........................................................................*................................... - // smulwb r3, r2, r7 // ...........................................................................................*.................. - // pkhtb r10, r10, r4, asr #16 // ..................................................................................*........................... - // smulwt r7, r2, r7 // .........................................................................................*.................... - // ldr r2, [r1, #-4] // ..............................................................................*............................... - // smlabt r3, r3, r12, r0 // ...............................................................................................*.............. - // uadd16 r4, r5, r10 // ........................................................................................*..................... 
- // smlabt r7, r7, r12, r0 // .............................................................................................*................ - // usub16 r10, r5, r10 // ......................................................................................*....................... - // smulwb r5, r2, r9 // .................................................................................*............................ - // pkhtb r7, r7, r3, asr #16 // ....................................................................................................*......... - // smulwt r9, r2, r9 // ...................................................................................*.......................... - // ldr r2, [r1, #-16] // ..............................................................*............................................... - // smlabt r5, r5, r12, r0 // .....................................................................................*........................ - // usub16 r3, r6, r7 // ......................................................................................................*....... - // smlabt r9, r9, r12, r0 // .......................................................................................*...................... - // uadd16 r6, r6, r7 // ..........................................................................................................*... - // smulwb r7, r2, r14 // ...................................................................*.......................................... - // smulwt r2, r2, r14 // .....................................................................*........................................ - // pkhtb r14, r9, r5, asr #16 // ..........................................................................................*................... - // smlabt r7, r7, r12, r0 // .......................................................................*...................................... 
- // usub16 r5, r8, r14 // ............................................................................................................*. - // smlabt r9, r2, r12, r0 // .........................................................................*.................................... - // vmov r0, s23 // ................................................................................................*............. - // str.w r4, [r0, #8] // ..................................................................................................*........... - // uadd16 r4, r8, r14 // ............................................................................................*................. - // str.w r10, [r0, #12] // ...................................................................................................*.......... - // pkhtb r7, r9, r7, asr #16 // ............................................................................*................................. - // str.w r3, [r0, #20] // .......................................................................................................*...... - // uadd16 r10, r11, r7 // ........................................................................................................*..... - // str.w r6, [r0, #16] // ...........................................................................................................*.. - // usub16 r6, r11, r7 // ..............................................................................................*............... - // str.w r4, [r0, #24] // .....................................................................................................*........ - // str.w r5, [r0, #28] // .............................................................................................................* - // str.w r6, [r0, #4] // .................................................................................................*............ 
- // str.w r10, [r0], #32 // .........................................................................................................*.... + // ----------------------------------------------- new position ------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|--------- + // ldr.w r9, [r1], #28 // .*............................................................................................................ + // vmov s23, r0 // *............................................................................................................. + // ldr.w r11, [r0, #24] // ..*........................................................................................................... + // ldr.w r2, [r0, #20] // ...*.......................................................................................................... + // ldr.w r5, [r0, #12] // ....*......................................................................................................... + // ldr.w r8, [r0, #16] // .....*........................................................................................................ + // ldr.w r3, [r0, #28] // ......*....................................................................................................... + // smulwt r14, r9, r2 // .........*.................................................................................................... + // ldr.w r10, [r0, #4] // ........*..................................................................................................... + // smulwb r7, r9, r2 // .......*...................................................................................................... + // ldr.w r2, [r0, #8] // ..........*................................................................................................... 
+ // smulwb r6, r9, r3 // ...........*.................................................................................................. + // ldr.w r4, [r0, #0] // ............*................................................................................................. + // smulwt r3, r9, r3 // .............*................................................................................................ + // movw r0, #24608 // ..............*............................................................................................... + // smlabt r6, r6, r12, r0 // ...............*.............................................................................................. + // smlabt r7, r7, r12, r0 // ................*............................................................................................. + // smlabt r3, r3, r12, r0 // .................*............................................................................................ + // smlabt r14, r14, r12, r0 // ..................*........................................................................................... + // pkhtb r6, r3, r6, asr #16 // ...................*.......................................................................................... + // smulwb r3, r9, r11 // ....................*......................................................................................... + // pkhtb r7, r14, r7, asr #16 // ......................*....................................................................................... + // smulwt r11, r9, r11 // .....................*........................................................................................ + // smlabt r14, r3, r12, r0 // .......................*...................................................................................... + // uadd16 r3, r10, r7 // ..........................*................................................................................... 
+ // smlabt r11, r11, r12, r0 // .........................*.................................................................................... + // usub16 r7, r10, r7 // ........................*..................................................................................... + // smulwb r10, r9, r8 // ...........................*.................................................................................. + // pkhtb r11, r11, r14, asr #16 // ............................*................................................................................. + // smulwt r8, r9, r8 // .............................*................................................................................ + // uadd16 r14, r5, r6 // ......................................*....................................................................... + // smlabt r10, r10, r12, r0 // ...............................*.............................................................................. + // ldr r9, [r1, #-24] // .............................................*................................................................ + // smlabt r8, r8, r12, r0 // .................................*............................................................................ + // usub16 r6, r5, r6 // ........................................*..................................................................... + // pkhtb r8, r8, r10, asr #16 // ....................................*......................................................................... + // smulwb r5, r9, r14 // ..................................................*........................................................... + // usub16 r10, r2, r11 // ................................*............................................................................. + // smulwt r14, r9, r14 // ....................................................*......................................................... 
+ // uadd16 r11, r2, r11 // ..................................*........................................................................... + // smlabt r5, r5, r12, r0 // ......................................................*....................................................... + // uadd16 r2, r4, r8 // ...............................................*.............................................................. + // smlabt r14, r14, r12, r0 // ........................................................*..................................................... + // usub16 r8, r4, r8 // .................................................*............................................................ + // smulwt r4, r9, r11 // ...........................................................*.................................................. + // pkhtb r14, r14, r5, asr #16 // ..........................................................*................................................... + // smulwb r9, r9, r11 // .........................................................*.................................................... + // ldr r11, [r1, #-20] // ..............................*............................................................................... + // smlabt r4, r4, r12, r0 // ...............................................................*.............................................. + // usub16 r5, r3, r14 // ..............................................................*............................................... + // smlabt r9, r9, r12, r0 // .............................................................*................................................ + // uadd16 r14, r3, r14 // ................................................................*............................................. + // smulwb r3, r11, r10 // ...................................*.......................................................................... 
+ // smulwt r10, r11, r10 // .....................................*........................................................................ + // pkhtb r9, r4, r9, asr #16 // ..................................................................*........................................... + // usub16 r4, r2, r9 // ................................................................................*............................. + // smlabt r3, r3, r12, r0 // .......................................*...................................................................... + // smlabt r10, r10, r12, r0 // .........................................*.................................................................... + // uadd16 r2, r2, r9 // ..............................................................................*............................... + // smulwb r9, r11, r6 // ..........................................*................................................................... + // pkhtb r10, r10, r3, asr #16 // ...........................................*.................................................................. + // smulwt r3, r11, r6 // ............................................*................................................................. + // ldr r6, [r1, #-16] // ....................................................................*......................................... + // smlabt r9, r9, r12, r0 // ..............................................*............................................................... + // uadd16 r11, r8, r10 // ........................................................................*..................................... + // smlabt r3, r3, r12, r0 // ................................................*............................................................. + // usub16 r10, r8, r10 // ......................................................................*....................................... 
+ // smulwb r8, r6, r14 // .........................................................................*.................................... + // pkhtb r3, r3, r9, asr #16 // ...................................................*.......................................................... + // smulwt r6, r6, r14 // ...........................................................................*.................................. + // ldr r14, [r1, #-12] // ............................................................*................................................. + // smlabt r8, r8, r12, r0 // .............................................................................*................................ + // usub16 r9, r7, r3 // .....................................................*........................................................ + // smlabt r6, r6, r12, r0 // ...............................................................................*.............................. + // uadd16 r7, r7, r3 // .......................................................*...................................................... + // smulwb r3, r14, r5 // .................................................................*............................................ + // pkhtb r6, r6, r8, asr #16 // ..................................................................................*........................... + // smulwt r8, r14, r5 // ...................................................................*.......................................... + // ldr r14, [r1, #-8] // ....................................................................................*......................... + // smlabt r5, r3, r12, r0 // .....................................................................*........................................ + // usub16 r3, r2, r6 // ........................................................................................*..................... 
+ // smlabt r8, r8, r12, r0 // .......................................................................*...................................... + // uadd16 r6, r2, r6 // ......................................................................................*....................... + // smulwb r2, r14, r7 // .........................................................................................*.................... + // pkhtb r8, r8, r5, asr #16 // ..........................................................................*................................... + // smulwt r14, r14, r7 // ..........................................................................................*................... + // ldr r7, [r1, #-4] // ............................................................................*................................. + // smlabt r2, r2, r12, r0 // ............................................................................................*................. + // usub16 r5, r4, r8 // .............................................................................................*................ + // smlabt r14, r14, r12, r0 // ..............................................................................................*............... + // uadd16 r8, r4, r8 // ...........................................................................................*.................. + // smulwb r4, r7, r9 // .................................................................................*............................ + // pkhtb r14, r14, r2, asr #16 // ..................................................................................................*........... + // smulwt r9, r7, r9 // ...................................................................................*.......................... + // usub16 r7, r11, r14 // ....................................................................................................*......... 
+ // smlabt r2, r4, r12, r0 // .....................................................................................*........................ + // uadd16 r11, r11, r14 // ......................................................................................................*....... + // smlabt r14, r9, r12, r0 // .......................................................................................*...................... + // vmov r0, s23 // ...............................................................................................*.............. + // str.w r3, [r0, #4] // .................................................................................................*............ + // pkhtb r14, r14, r2, asr #16 // ........................................................................................................*..... + // str.w r6, [r0], #32 // ................................................................................................*............. + // usub16 r9, r10, r14 // ............................................................................................................*. + // str r5, [r0, #-20] // ...................................................................................................*.......... + // uadd16 r6, r10, r14 // ..........................................................................................................*... + // str r8, [r0, #-24] // .....................................................................................................*........ + // str r7, [r0, #-12] // .........................................................................................................*.... + // str r11, [r0, #-16] // .......................................................................................................*...... + // str r6, [r0, #-8] // ...........................................................................................................*.. 
+ // str r9, [r0, #-4] // .............................................................................................................* vmov tmp, s13 cmp poly, tmp