diff --git a/examples/opt/armv7m/pointwise_769_dilithium_opt_m7.s b/examples/opt/armv7m/pointwise_769_dilithium_opt_m7.s index 1a7c5d49..8cc5f92b 100644 --- a/examples/opt/armv7m/pointwise_769_dilithium_opt_m7.s +++ b/examples/opt/armv7m/pointwise_769_dilithium_opt_m7.s @@ -33,162 +33,159 @@ small_pointmul_asm_769_opt_m7: add.w r3, r2, #64*width - // Instructions: 2 - // Expected cycles: 1 - // Expected IPC: 2.00 - // - // Cycle bound: 1.0 - // IPC bound: 2.00 - // - // Wall time: 0.00s - // User time: 0.00s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr.w r10, [r2, #4] // *............................. - ldr.w r8, [r1, #8] // *............................. - - // ------ cycle (expected) ------> + // Instructions: 5 + // Expected cycles: 3 + // Expected IPC: 1.67 + // + // Cycle bound: 3.0 + // IPC bound: 1.67 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> // 0 25 - // |------------------------|----- - // ldr.w r10, [r2, #4] // *.............................. - // ldr.w r8, [r1, #8] // *.............................. + // |------------------------|---- + ldr.w r10, [r1, #8] // *............................. + ldr.w r5, [r2, #4] // *............................. + ldr.w r9, [r1], #4*4 // .*............................ + ldr.w r6, [r2], #2*4 // ..*........................... + smulwt r7, r5, r10 // ..*........................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr.w r10, [r1, #8] // *.............................. + // ldr.w r5, [r2, #4] // *.............................. + // ldr.w r9, [r1], #4*4 // .*............................. + // smulwt r7, r5, r10 // ..*............................ + // ldr.w r6, [r2], #2*4 // ..*............................ sub r2, r2, #0 1: - // Instructions: 25 - // Expected cycles: 13 - // Expected IPC: 1.92 + // Instructions: 24 + // Expected cycles: 12 + // Expected IPC: 2.00 // - // Cycle bound: 15.0 - // IPC bound: 1.67 + // Cycle bound: 17.0 + // IPC bound: 1.41 // - // Wall time: 1.41s - // User time: 1.41s + // Wall time: 0.96s + // User time: 0.96s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr.w r5, [r2], #2*4 // *............................. - ldr.w r4, [r1, #12] // *............................. - smulwt r7, r10, r8 // .*............................ - neg r10, r10 // .*............................ - ldr.w r11, [r1], #4*4 // ..*........................... - smulwt r6, r10, r4 // ..*........................... + ldr r4, [r1, #-4] // *............................. + smlabt r11, r7, r12, r14 // *............................. + neg r5, r5 // .*............................ + smulwt r7, r6, r9 // .*............................ + neg r6, r6 // ..*........................... + smulwt r5, r5, r4 // ..*........................... + ldr r8, [r1, #-12] // ...*.......................... smlabt r7, r7, r12, r14 // ...*.......................... - ldr.w r9, [r1, #-12] // ...*.......................... - smlabt r6, r6, r12, r14 // ....*......................... - cmp.w r3, r2 // ....*......................... - smulwt r10, r5, r11 // .....*........................ - neg r5, r5 // .....*........................ - pkhbt r7, r8, r7 // ......*....................... - smulwt r8, r5, r9 // ......*....................... - pkhbt r4, r4, r6 // .......*...................... - smlabt r5, r10, r12, r14 // .......*...................... - ldr.w r10, [r2, #4] // ........e..................... - smlabt r6, r8, r12, r14 // ........*..................... - ldr.w r8, [r1, #8] // .........e.................... - str.w r4, [r0, #12] // .........*.................... - pkhbt r4, r11, r5 // ..........*................... - str.w r4, [r0], #2*4 // ..........*................... - str.w r7, [r0], #2*4 // ...........*.................. - pkhbt r11, r9, r6 // ............*................. - str.w r11, [r0, #-12] // ............*................. + pkhbt r11, r10, r11 // ....*......................... + smlabt r5, r5, r12, r14 // ....*......................... + ldr.w r10, [r1, #8] // .....e........................ + smulwt r6, r6, r8 // .....*........................ + pkhbt r7, r9, r7 // ......*....................... + str.w r7, [r0], #2*4 // ......*....................... + pkhbt r9, r4, r5 // .......*...................... + smlabt r6, r6, r12, r14 // .......*...................... + ldr.w r5, [r2, #4] // ........e..................... + str.w r9, [r0, #4] // ........*..................... + ldr.w r9, [r1], #4*4 // .........e.................... + str.w r11, [r0], #2*4 // .........*.................... + pkhbt r8, r8, r6 // ..........*................... + smulwt r7, r5, r10 // ..........e................... + str r8, [r0, #-12] // ...........*.................. + ldr.w r6, [r2], #2*4 // ...........e.................. // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr.w r7, [r1, #2*4] // .e...'........~...'........~... - // ldr.w r8, [r1, #3*4] // .....*............~............ - // ldr.w r9, [r2, #1*4] // e....'.......~....'.......~.... - // ldr.w r5, [r1, #1*4] // .....'..*.........'..~......... - // ldr.w r4, [r1], #4*4 // .....'.*..........'.~.......... - // ldr.w r6, [r2], #2*4 // .....*............~............ - // smulwt r10, r6, r4 // .....'....*.......'....~....... - // smlabt r10, r10, r12, r14 // .....'......*.....'......~..... - // pkhbt r4, r4, r10 // ..~..'.........*..'.........~.. - // neg r6, r6 // .....'....*.......'....~....... - // smulwt r10, r6, r5 // .....'.....*......'.....~...... - // smlabt r10, r10, r12, r14 // ~....'.......*....'.......~.... - // pkhbt r5, r5, r10 // ....~'...........*'............ - // str.w r5, [r0, #1*4] // ....~'...........*'............ - // str.w r4, [r0], #2*4 // ..~..'.........*..'.........~.. - // smulwt r10, r9, r7 // .....'*...........'~........... - // smlabt r10, r10, r12, r14 // .....'..*.........'..~......... - // pkhbt r7, r7, r10 // .....'.....*......'.....~...... - // neg r9, r9 // .....'*...........'~........... - // smulwt r10, r9, r8 // .....'.*..........'.~.......... - // smlabt r10, r10, r12, r14 // .....'...*........'...~........ - // pkhbt r8, r8, r10 // .....'......*.....'......~..... - // str.w r8, [r0, #1*4] // .~...'........*...'........~... - // str.w r7, [r0], #2*4 // ...~.'..........*.'..........~. - // cmp.w r3, r2 // .....'...*........'...~........ + // ldr.w r7, [r1, #2*4] // e......'....~......'....~...... + // ldr.w r8, [r1, #3*4] // .......*...........~........... + // ldr.w r9, [r2, #1*4] // ...e...'.......~...'.......~... + // ldr.w r5, [r1, #1*4] // .......'..*........'..~........ + // ldr.w r4, [r1], #4*4 // ....e..'........~..'........~.. + // ldr.w r6, [r2], #2*4 // ......e'..........~'........... + // smulwt r10, r6, r4 // .......'*..........'~.......... + // smlabt r10, r10, r12, r14 // .......'..*........'..~........ + // pkhbt r4, r4, r10 // .~.....'.....*.....'.....~..... + // neg r6, r6 // .......'.*.........'.~......... + // smulwt r10, r6, r5 // ~......'....*......'....~...... + // smlabt r10, r10, r12, r14 // ..~....'......*....'......~.... + // pkhbt r5, r5, r10 // .....~.'.........*.'.........~. + // str.w r5, [r0, #1*4] // ......~'..........*'........... + // str.w r4, [r0], #2*4 // .~.....'.....*.....'.....~..... + // smulwt r10, r9, r7 // .....e.'.........~.'.........~. + // smlabt r10, r10, r12, r14 // .......*...........~........... + // pkhbt r7, r7, r10 // .......'...*.......'...~....... + // neg r9, r9 // .......'*..........'~.......... + // smulwt r10, r9, r8 // .......'.*.........'.~......... + // smlabt r10, r10, r12, r14 // .......'...*.......'...~....... + // pkhbt r8, r8, r10 // ..~....'......*....'......~.... + // str.w r8, [r0, #1*4] // ...~...'.......*...'.......~... + // str.w r7, [r0], #2*4 // ....~..'........*..'........~.. + cmp r3, r2 bne 1b - // Instructions: 23 - // Expected cycles: 13 - // Expected IPC: 1.77 - // - // Cycle bound: 13.0 - // IPC bound: 1.77 - // - // Wall time: 0.05s - // User time: 0.05s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - cmp.w r3, r2 // *............................. - ldr.w r5, [r1, #12] // *............................. - smulwt r4, r10, r8 // .*............................ - neg r10, r10 // .*............................ - ldr.w r11, [r2], #2*4 // ..*........................... - smulwt r9, r10, r5 // ..*........................... - smlabt r4, r4, r12, r14 // ...*.......................... - ldr.w r6, [r1], #4*4 // ...*.......................... - smlabt r9, r9, r12, r14 // ....*......................... - ldr.w r7, [r1, #-12] // ....*......................... - smulwt r10, r11, r6 // .....*........................ - neg r11, r11 // .....*........................ - pkhbt r8, r8, r4 // ......*....................... - smulwt r4, r11, r7 // ......*....................... - pkhbt r5, r5, r9 // .......*...................... - smlabt r9, r10, r12, r14 // .......*...................... - smlabt r4, r4, r12, r14 // ........*..................... - str.w r5, [r0, #12] // .........*.................... - pkhbt r5, r6, r9 // ..........*................... - str.w r5, [r0], #2*4 // ..........*................... - pkhbt r5, r7, r4 // ...........*.................. - str.w r8, [r0], #2*4 // ...........*.................. - str.w r5, [r0, #-12] // ............*................. + // Instructions: 19 + // Expected cycles: 11 + // Expected IPC: 1.73 + // + // Cycle bound: 11.0 + // IPC bound: 1.73 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r4, [r1, #-12] // *............................. + smulwt r8, r6, r9 // *............................. + neg r6, r6 // .*............................ + smlabt r7, r7, r12, r14 // .*............................ + neg r5, r5 // ..*........................... + smulwt r6, r6, r4 // ..*........................... + ldr r11, [r1, #-4] // ...*.......................... + smlabt r8, r8, r12, r14 // ...*.......................... + pkhbt r10, r10, r7 // ....*......................... + smlabt r6, r6, r12, r14 // ....*......................... + smulwt r7, r5, r11 // .....*........................ + pkhbt r8, r9, r8 // ......*....................... + str.w r8, [r0], #2*4 // ......*....................... + pkhbt r6, r4, r6 // .......*...................... + smlabt r4, r7, r12, r14 // .......*...................... + str.w r10, [r0], #2*4 // ........*..................... + str r6, [r0, #-12] // .........*.................... + pkhbt r6, r11, r4 // ..........*................... + str r6, [r0, #-4] // ..........*................... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr.w r5, [r2], #2*4 // ..*............................ - // ldr.w r4, [r1, #12] // *.............................. - // smulwt r7, r10, r8 // .*............................. - // neg r10, r10 // .*............................. - // ldr.w r11, [r1], #4*4 // ...*........................... - // smulwt r6, r10, r4 // ..*............................ + // ldr r4, [r1, #-4] // ...*........................... + // smlabt r11, r7, r12, r14 // .*............................. + // neg r5, r5 // ..*............................ + // smulwt r7, r6, r9 // *.............................. + // neg r6, r6 // .*............................. + // smulwt r5, r5, r4 // .....*......................... + // ldr r8, [r1, #-12] // *.............................. // smlabt r7, r7, r12, r14 // ...*........................... - // ldr.w r9, [r1, #-12] // ....*.......................... + // pkhbt r11, r10, r11 // ....*.......................... + // smlabt r5, r5, r12, r14 // .......*....................... + // smulwt r6, r6, r8 // ..*............................ + // pkhbt r7, r9, r7 // ......*........................ + // str.w r7, [r0], #2*4 // ......*........................ + // pkhbt r9, r4, r5 // ..........*.................... // smlabt r6, r6, r12, r14 // ....*.......................... - // cmp.w r3, r2 // *.............................. - // smulwt r10, r5, r11 // .....*......................... - // neg r5, r5 // .....*......................... - // pkhbt r7, r8, r7 // ......*........................ - // smulwt r8, r5, r9 // ......*........................ - // pkhbt r4, r4, r6 // .......*....................... - // smlabt r5, r10, r12, r14 // .......*....................... - // smlabt r6, r8, r12, r14 // ........*...................... - // str.w r4, [r0, #12] // .........*..................... - // pkhbt r4, r11, r5 // ..........*.................... - // str.w r4, [r0], #2*4 // ..........*.................... - // str.w r7, [r0], #2*4 // ...........*................... - // pkhbt r11, r9, r6 // ...........*................... - // str.w r11, [r0, #-12] // ............*.................. + // str.w r9, [r0, #4] // ..........*.................... + // str.w r11, [r0], #2*4 // ........*...................... + // pkhbt r8, r8, r6 // .......*....................... + // str r8, [r0, #-12] // .........*..................... pop.w {r4-r11, pc}