From 14dc6b771660f19fa0c60caf7a1c49901a5d15b9 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Wed, 18 Dec 2024 17:07:02 +0800
Subject: [PATCH] M7: reduce32_dilithium

---
 example.py                                    |  22 +++
 examples/naive/armv7m/reduce32_dilithium.s    |  52 +++++++
 .../opt/armv7m/reduce32_dilithium_opt_m7.s    | 136 ++++++++++++++++++
 3 files changed, 210 insertions(+)
 create mode 100644 examples/naive/armv7m/reduce32_dilithium.s
 create mode 100644 examples/opt/armv7m/reduce32_dilithium_opt_m7.s

Reviewer notes (placed after the three-dash line, so not part of the commit message):
  * example.py gains a `reduce32_dilithium` example class whose core() asks
    SLOTHY to optimize the loop labelled `1` and is registered in main().
  * The naive file is the hand-written loop: per iteration it loads eight
    32-bit words from [r0], applies `redq` to each (a -= ((a + 2^22) >> 23) * q
    with q = 8380417 held in r12), stores them back and advances r0 by 32 bytes.
  * The opt file is the committed scheduler output for that loop; the original
    instruction order is kept there as comments next to the cycle diagrams.
  * `rename=True` presumably makes the optimized output use a renamed symbol
    (the committed output defines pqcrystals_dilithium_asm_reduce32_opt_m7);
    confirm against the Example base class.
  * Documentation below was added only as trailing comments on existing added
    lines, so hunk line counts and the diffstat are unchanged. The abbreviated
    blob ids on the `index` lines were not recomputed.

diff --git a/example.py b/example.py
index f42eedc2..c83f73ba 100644
--- a/example.py
+++ b/example.py
@@ -1680,6 +1680,27 @@ def core(self, slothy):
         slothy.config.sw_pipelining.enabled = True
         slothy.optimize_loop("_asymmetric_mul_16_loop")
 
+
+class reduce32_dilithium(Example):  # example wrapper for the Armv7-M reduce32 loop
+    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
+        name = "reduce32_dilithium"  # base name; variant and target label are appended below
+        infile = name  # input file stem; only the variant suffix is appended to it
+        funcname = "pqcrystals_dilithium_asm_reduce32"  # symbol defined in the naive source
+
+        if var != "":
+            name += f"_{var}"
+            infile += f"_{var}"
+        name += f"_{target_label_dict[target]}"  # the target label goes into the name only
+
+        super().__init__(infile, name, rename=True, arch=arch, target=target, timeout=timeout, funcname=funcname)
+
+    def core(self, slothy):
+        slothy.config.outputs = ["r10"]  # r10 is the loop counter in the assembly
+        slothy.config.inputs_are_outputs = True  # presumably keeps loop inputs live afterwards - confirm
+        slothy.config.constraints.stalls_first_attempt = 4  # presumably the first stall budget tried - confirm
+        slothy.config.sw_pipelining.enabled = True  # allow software pipelining for this loop
+        slothy.optimize_loop("1")  # "1" is the numeric label of the loop (`1:` ... `bne.w 1b`)
+
 def main():
     examples = [ Example0(),
                  Example1(),
@@ -1842,6 +1863,7 @@ def main():
                  basemul_257_asymmetric_dilithium(),
                  pointwise_769_dilithium(),
                  pointwise_769_asymmetric_dilithium(),
+                 reduce32_dilithium(),
                  ]
 
     all_example_names = [e.name for e in examples]
diff --git a/examples/naive/armv7m/reduce32_dilithium.s b/examples/naive/armv7m/reduce32_dilithium.s
new file mode 100644
index 00000000..8fef6f48
--- /dev/null
+++ b/examples/naive/armv7m/reduce32_dilithium.s
@@ -0,0 +1,52 @@
+.syntax unified
+.thumb
+.macro redq a, tmp, q
+    add \tmp, \a, #4194304       // tmp = a + 2^22 (4194304 = 1 << 22)
+    asrs \tmp, \tmp, #23         // tmp = (a + 2^22) >> 23, arithmetic shift; the `s` suffix also updates flags
+    mls \a, \tmp, \q, \a         // a = a - tmp * q
+.endm
+
+// void asm_reduce32(int32_t a[N]);
+.global pqcrystals_dilithium_asm_reduce32
+.type pqcrystals_dilithium_asm_reduce32, %function
+.align 2
+pqcrystals_dilithium_asm_reduce32:
+    push {r4-r11, r14}           // save r4-r11 and lr (r14); restored by the pop before returning
+
+    movw r12,#:lower16:8380417   // r12 = 8380417 (passed to redq as q): low halfword
+    movt r12,#:upper16:8380417   // ... and high halfword
+    movw r10, #32                // loop counter: 32 iterations, 8 words each
+    1:
+    ldr.w r1, [r0]               // load the 8 words at r0 + 0 .. r0 + 28 into r1-r8
+    ldr.w r2, [r0, #1*4]
+    ldr.w r3, [r0, #2*4]
+    ldr.w r4, [r0, #3*4]
+    ldr.w r5, [r0, #4*4]
+    ldr.w r6, [r0, #5*4]
+    ldr.w r7, [r0, #6*4]
+    ldr.w r8, [r0, #7*4]
+
+    redq r1, r9, r12             // reduce each word in place; r9 is the shared scratch register
+    redq r2, r9, r12
+    redq r3, r9, r12
+    redq r4, r9, r12
+    redq r5, r9, r12
+    redq r6, r9, r12
+    redq r7, r9, r12
+    redq r8, r9, r12
+
+    str.w r2, [r0, #1*4]         // write back words 1..7 first
+    str.w r3, [r0, #2*4]
+    str.w r4, [r0, #3*4]
+    str.w r5, [r0, #4*4]
+    str.w r6, [r0, #5*4]
+    str.w r7, [r0, #6*4]
+    str.w r8, [r0, #7*4]
+    str r1, [r0], #8*4           // word 0 last: the post-indexed store also advances r0 by 32 bytes
+    subs r10, #1                 // decrement the counter and set flags for the branch
+    bne.w 1b                     // repeat until r10 reaches zero
+
+    pop {r4-r11, r14}            // restore the saved registers, including lr
+    bx lr                        // return to the caller
+
+.size pqcrystals_dilithium_asm_reduce32, .-pqcrystals_dilithium_asm_reduce32
\ No newline at end of file
diff --git a/examples/opt/armv7m/reduce32_dilithium_opt_m7.s b/examples/opt/armv7m/reduce32_dilithium_opt_m7.s
new file mode 100644
index 00000000..256c65ea
--- /dev/null
+++ b/examples/opt/armv7m/reduce32_dilithium_opt_m7.s
@@ -0,0 +1,136 @@
+.syntax unified
+.thumb
+.macro redq a, tmp, q
+    add \tmp, \a, #4194304       // tmp = a + 2^22 (4194304 = 1 << 22)
+    asrs \tmp, \tmp, #23         // tmp = (a + 2^22) >> 23, arithmetic shift; the `s` suffix also updates flags
+    mls \a, \tmp, \q, \a         // a = a - tmp * q
+.endm
+
+// void asm_reduce32(int32_t a[N]);
+.global pqcrystals_dilithium_asm_reduce32_opt_m7
+.type pqcrystals_dilithium_asm_reduce32_opt_m7, %function
+.align 2
+pqcrystals_dilithium_asm_reduce32_opt_m7:
+    push {r4-r11, r14}           // save r4-r11 and lr (r14); r14 is reused as a scratch register in the loop below
+
+    movw r12,#:lower16:8380417   // r12 = 8380417 (the multiplier of every mls below): low halfword
+    movt r12,#:upper16:8380417   // ... and high halfword
+    movw r10, #32                // loop counter: 32 iterations, 8 words each
+        // Instructions:    0
+        // Expected cycles: 0
+        // Expected IPC:    0.00
+        //
+        // Wall time:     0.00s
+        // User time:     0.00s
+        //
+1:
+        // Instructions:    41
+        // Expected cycles: 21
+        // Expected IPC:    1.95
+        //
+        // Wall time:     7.03s
+        // User time:     7.03s
+        //
+        //                         ----- cycle (expected) ------>
+        //                         0                        25
+        //                         |------------------------|----
+        ldr.w r9, [r0, #16]     // *.............................
+        ldr.w r7, [r0, #4]      // *.............................
+        add r5, r7, #4194304    // .*............................
+        ldr.w r11, [r0, #20]    // .*............................
+        add r3, r11, #4194304   // ..*...........................
+        ldr.w r8, [r0, #28]     // ..*...........................
+        add r14, r8, #4194304   // ...*..........................
+        asrs r3, r3, #23        // ...*..........................
+        ldr.w r2, [r0, #12]     // ....*.........................
+        mls r4, r3, r12, r11    // ....*.........................
+        str.w r4, [r0, #20]     // .....*........................
+        asrs r3, r5, #23        // .....*........................
+        mls r3, r3, r12, r7     // ......*.......................
+        ldr.w r5, [r0, #8]      // ......*.......................
+        add r6, r9, #4194304    // .......*......................
+        add r4, r2, #4194304    // .......*......................
+        str.w r3, [r0, #4]      // ........*.....................
+        asrs r11, r4, #23       // ........*.....................
+        mls r4, r11, r12, r2    // .........*....................
+        asrs r6, r6, #23        // .........*....................
+        str.w r4, [r0, #12]     // ..........*...................
+        asrs r11, r14, #23      // ..........*...................
+        mls r4, r11, r12, r8    // ...........*..................
+        add r14, r5, #4194304   // ...........*..................
+        mls r6, r6, r12, r9     // ............*.................
+        asrs r14, r14, #23      // ............*.................
+        str.w r6, [r0, #16]     // .............*................
+        ldr.w r1, [r0, #24]     // .............*................
+        str.w r4, [r0, #28]     // ..............*...............
+        add r4, r1, #4194304    // ..............*...............
+        ldr.w r11, [r0]         // ...............*..............
+        mls r14, r14, r12, r5   // ...............*..............
+        asrs r2, r4, #23        // ................*.............
+        str.w r14, [r0, #8]     // ................*.............
+        mls r4, r2, r12, r1     // .................*............
+        add r2, r11, #4194304   // .................*............
+        str.w r4, [r0, #24]     // ..................*...........
+        asrs r7, r2, #23        // ..................*...........
+        mls r8, r7, r12, r11    // ...................*..........
+        subs r10, #1            // ...................*..........
+        str r8, [r0], #8*4      // ....................*.........
+
+        //                            ------ cycle (expected) ------>
+        //                            0                        25
+        //                            |------------------------|-----
+        // ldr.w r1, [r0]          // ...............*.....'.........
+        // ldr.w r2, [r0, #1*4]    // *....................~.........
+        // ldr.w r3, [r0, #2*4]    // ......*..............'.....~...
+        // ldr.w r4, [r0, #3*4]    // ....*................'...~.....
+        // ldr.w r5, [r0, #4*4]    // *....................~.........
+        // ldr.w r6, [r0, #5*4]    // .*...................'~........
+        // ldr.w r7, [r0, #6*4]    // .............*.......'.........
+        // ldr.w r8, [r0, #7*4]    // ..*..................'.~.......
+        // add r9, r1, #4194304    // .................*...'.........
+        // asrs r9, r9, #23        // ..................*..'.........
+        // mls r1, r9, r12, r1     // ...................*.'.........
+        // add r9, r2, #4194304    // .*...................'~........
+        // asrs r9, r9, #23        // .....*...............'....~....
+        // mls r2, r9, r12, r2     // ......*..............'.....~...
+        // add r9, r3, #4194304    // ...........*.........'.........
+        // asrs r9, r9, #23        // ............*........'.........
+        // mls r3, r9, r12, r3     // ...............*.....'.........
+        // add r9, r4, #4194304    // .......*.............'......~..
+        // asrs r9, r9, #23        // ........*............'.......~.
+        // mls r4, r9, r12, r4     // .........*...........'.........
+        // add r9, r5, #4194304    // .......*.............'......~..
+        // asrs r9, r9, #23        // .........*...........'.........
+        // mls r5, r9, r12, r5     // ............*........'.........
+        // add r9, r6, #4194304    // ..*..................'.~.......
+        // asrs r9, r9, #23        // ...*.................'..~......
+        // mls r6, r9, r12, r6     // ....*................'...~.....
+        // add r9, r7, #4194304    // ..............*......'.........
+        // asrs r9, r9, #23        // ................*....'.........
+        // mls r7, r9, r12, r7     // .................*...'.........
+        // add r9, r8, #4194304    // ...*.................'..~......
+        // asrs r9, r9, #23        // ..........*..........'.........
+        // mls r8, r9, r12, r8     // ...........*.........'.........
+        // str.w r2, [r0, #1*4]    // ........*............'.......~.
+        // str.w r3, [r0, #2*4]    // ................*....'.........
+        // str.w r4, [r0, #3*4]    // ..........*..........'.........
+        // str.w r5, [r0, #4*4]    // .............*.......'.........
+        // str.w r6, [r0, #5*4]    // .....*...............'....~....
+        // str.w r7, [r0, #6*4]    // ..................*..'.........
+        // str.w r8, [r0, #7*4]    // ..............*......'.........
+        // str r1, [r0], #8*4      // ....................*'.........
+        // subs r10, #1            // ...................*.'.........
+
+        bne 1b                  // flags come from `subs r10, #1`; the `str` scheduled after it does not change them
+        // Instructions:    0
+        // Expected cycles: 0
+        // Expected IPC:    0.00
+        //
+        // Wall time:     0.00s
+        // User time:     0.00s
+        //
+
+    pop {r4-r11, r14}            // restore the saved registers, including lr
+    bx lr                        // return to the caller
+
+.size pqcrystals_dilithium_asm_reduce32_opt_m7, .-pqcrystals_dilithium_asm_reduce32_opt_m7
\ No newline at end of file