diff --git a/examples/naive/aarch64/aarch64_simple0_loop.s b/examples/naive/aarch64/aarch64_simple0_loop.s index fcfef9e1..10512245 100644 --- a/examples/naive/aarch64/aarch64_simple0_loop.s +++ b/examples/naive/aarch64/aarch64_simple0_loop.s @@ -33,8 +33,9 @@ modulus_ptr .req x2 .endm count .req x2 - ldr qtwiddle, [twiddle_ptr, #0] - ldr qmodulus, [modulus_ptr, #0] +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 start: ldr qdata0, [data_ptr, #0*16] diff --git a/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s b/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s index 7638d59f..f945ec37 100644 --- a/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s +++ b/examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s @@ -33,92 +33,93 @@ modulus_ptr .req x2 .endm count .req x2 - ldr qtwiddle, [twiddle_ptr, #0] - ldr qmodulus, [modulus_ptr, #0] - ldr q30, [x0, #48] - sqrdmulh v7.8H, v30.8H, v0.H[1] +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 + ldr q3, [x0, #16] + sqrdmulh v7.8H, v3.8H, v0.H[1] sub count, count, #1 start: - ldr q28, [x0, #16] // .*................ + mul v3.8H, v3.8H, v0.H[0] // ....*............. // gap // .................. + ldr q19, [x0, #48] // ...*.............. // gap // .................. // gap // .................. - mul v5.8H, v30.8H, v0.H[0] // .........*........ // gap // .................. - ldr q20, [x0, #32] // ..*............... + ldr q15, [x0, #0] // *................. // gap // .................. // gap // .................. // gap // .................. - mul v14.8H, v28.8H, v0.H[0] // ....*............. + mls v3.8H, v7.8H, v1.H[0] // ......*........... // gap // .................. - mls v5.8H, v7.8H, v1.H[0] // ...........*...... + mul v13.8H, v19.8H, v0.H[0] // .........*........ // gap // .................. - sqrdmulh v13.8H, v28.8H, v0.H[1] // .....*............ + sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........*....... // gap // .................. 
- ldr q25, [x0, #0] // *................. + ldr q7, [x0, #32] // ..*............... // gap // .................. // gap // .................. // gap // .................. - sub v11.8H, v20.8H, v5.8H // ............*..... + sub v17.8H, v15.8H, v3.8H // .......*.......... // gap // .................. - add v12.8H, v20.8H, v5.8H // .............*.... + add v10.8H, v15.8H, v3.8H // ........*......... // gap // .................. - mls v14.8H, v13.8H, v1.H[0] // ......*........... + mls v13.8H, v19.8H, v1.H[0] // ...........*...... // gap // .................. - ldr q30, [x0, #112] // ...e.............. + str q17, [x0, #16] // ...............*.. // gap // .................. + ldr q3, [x0, #80] // .e................ // gap // .................. // gap // .................. - str q11, [x0, #48] // .................* // gap // .................. - sub v26.8H, v25.8H, v14.8H // .......*.......... + add v15.8H, v7.8H, v13.8H // .............*.... // gap // .................. - str q12, [x0, #32] // ................*. + str q10, [x0], #4*16 // ..............*... // gap // .................. - add v13.8H, v25.8H, v14.8H // ........*......... + sub v13.8H, v7.8H, v13.8H // ............*..... // gap // .................. - str q26, [x0, #16] // ...............*.. + str q15, [x0, #-32] // ................*. // gap // .................. - sqrdmulh v7.8H, v30.8H, v0.H[1] // ..........e....... + sqrdmulh v7.8H, v3.8H, v0.H[1] // .....e............ // gap // .................. - str q13, [x0], #4*16 // ..............*... + str q13, [x0, #-16] // .................* // gap // .................. // original source code - // ldr q8, [x0, #0*16] // ........|.....*........... - // ldr q9, [x0, #1*16] // ........*................. - // ldr q10, [x0, #2*16] // ........|.*............... - // ldr q11, [x0, #3*16] // e.......|.........e....... - // mul v12.8h, v9.8h, v0.h[0] // ........|..*.............. - // sqrdmulh v9.8h, v9.8h, v0.h[1] // ........|....*............ 
- // mls v12.8h, v9.8h, v1.h[0] // ........|........*........ - // sub v9.8h, v8.8h, v12.8h // ..*.....|...........*..... - // add v8.8h, v8.8h, v12.8h // ....*...|.............*... - // mul v12.8h, v11.8h, v0.h[0] // ........|*................ - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ......e.|...............e. - // mls v12.8h, v11.8h, v1.h[0] // ........|...*............. - // sub v11.8h, v10.8h, v12.8h // ........|......*.......... - // add v10.8h, v10.8h, v12.8h // ........|.......*......... - // str q8, [x0], #4*16 // .......*|................* - // str q9, [x0, #-3*16] // .....*..|..............*.. - // str q10, [x0, #-2*16] // ...*....|............*.... - // str q11, [x0, #-1*16] // .*......|..........*...... + // ldr q8, [x0, #0*16] // .......|.*............... + // ldr q9, [x0, #1*16] // e......|..........e...... + // ldr q10, [x0, #2*16] // .......|.....*........... + // ldr q11, [x0, #3*16] // .......|*................ + // mul v12.8h, v9.8h, v0.h[0] // .......*................. + // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e.|...............e. + // mls v12.8h, v9.8h, v1.h[0] // .......|..*.............. + // sub v9.8h, v8.8h, v12.8h // .......|......*.......... + // add v8.8h, v8.8h, v12.8h // .......|.......*......... + // mul v12.8h, v11.8h, v0.h[0] // .......|...*............. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // .......|....*............ + // mls v12.8h, v11.8h, v1.h[0] // .......|........*........ + // sub v11.8h, v10.8h, v12.8h // ...*...|.............*... + // add v10.8h, v10.8h, v12.8h // .*.....|...........*..... + // str q8, [x0], #4*16 // ..*....|............*.... + // str q9, [x0, #-3*16] // .......|.........*....... + // str q10, [x0, #-2*16] // ....*..|..............*.. 
+ // str q11, [x0, #-1*16] // ......*|................* sub count, count, #1 cbnz count, start - ldr q28, [x0, #16] - mul v5.8H, v30.8H, v0.H[0] - ldr q20, [x0, #32] - mul v14.8H, v28.8H, v0.H[0] - mls v5.8H, v7.8H, v1.H[0] - sqrdmulh v13.8H, v28.8H, v0.H[1] - ldr q25, [x0, #0] - sub v11.8H, v20.8H, v5.8H - add v12.8H, v20.8H, v5.8H - mls v14.8H, v13.8H, v1.H[0] - str q11, [x0, #48] - sub v26.8H, v25.8H, v14.8H - str q12, [x0, #32] - add v13.8H, v25.8H, v14.8H - str q26, [x0, #16] - str q13, [x0], #4*16 \ No newline at end of file + mul v3.8H, v3.8H, v0.H[0] + ldr q19, [x0, #48] + ldr q15, [x0, #0] + mls v3.8H, v7.8H, v1.H[0] + mul v13.8H, v19.8H, v0.H[0] + sqrdmulh v19.8H, v19.8H, v0.H[1] + ldr q7, [x0, #32] + sub v17.8H, v15.8H, v3.8H + add v10.8H, v15.8H, v3.8H + mls v13.8H, v19.8H, v1.H[0] + str q17, [x0, #16] + add v15.8H, v7.8H, v13.8H + str q10, [x0], #4*16 + sub v13.8H, v7.8H, v13.8H + str q15, [x0, #-32] + str q13, [x0, #-16] \ No newline at end of file diff --git a/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s b/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s index f6586f01..3e8c3935 100644 --- a/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s +++ b/examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s @@ -33,50 +33,51 @@ modulus_ptr .req x2 .endm count .req x2 - ldr qtwiddle, [twiddle_ptr, #0] - ldr qmodulus, [modulus_ptr, #0] - ldr q28, [x0, #16] - ldr q3, [x0, #48] - sqrdmulh v12.8H, v28.8H, v0.H[1] - mul v6.8H, v28.8H, v0.H[0] - sqrdmulh v29.8H, v3.8H, v0.H[1] - mls v6.8H, v12.8H, v1.H[0] +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 + ldr q7, [x0, #16] + ldr q17, [x0, #48] + sqrdmulh v25.8H, v7.8H, v0.H[1] + mul v3.8H, v7.8H, v0.H[0] + sqrdmulh v27.8H, v17.8H, v0.H[1] + mls v3.8H, v25.8H, v1.H[0] sub count, count, #1 start: - ldr q8, [x0, #0] // *................. - ldr q28, [x0, #80] // .e................ - mul v27.8H, v3.8H, v0.H[0] // .........*........ - ldr q24, [x0, #32] // ..*............... 
- ldr q3, [x0, #112] // ...e.............. + ldr q15, [x0, #0] // *................. + ldr q7, [x0, #80] // .e................ + mul v13.8H, v17.8H, v0.H[0] // .........*........ + ldr q19, [x0, #32] // ..*............... + ldr q17, [x0, #112] // ...e.............. // gap // .................. // gap // .................. // gap // .................. - mls v27.8H, v29.8H, v1.H[0] // ...........*...... + mls v13.8H, v27.8H, v1.H[0] // ...........*...... // gap // .................. // gap // .................. // gap // .................. + sqrdmulh v25.8H, v7.8H, v0.H[1] // .....e............ // gap // .................. - sqrdmulh v12.8H, v28.8H, v0.H[1] // .....e............ - add v10.8H, v8.8H, v6.8H // ........*......... - sub v19.8H, v8.8H, v6.8H // .......*.......... + add v4.8H, v15.8H, v3.8H // ........*......... + sub v10.8H, v15.8H, v3.8H // .......*.......... // gap // .................. // gap // .................. - mul v6.8H, v28.8H, v0.H[0] // ....e............. + mul v3.8H, v7.8H, v0.H[0] // ....e............. // gap // .................. // gap // .................. - sub v4.8H, v24.8H, v27.8H // ............*..... + sub v31.8H, v19.8H, v13.8H // ............*..... // gap // .................. // gap // .................. - sqrdmulh v29.8H, v3.8H, v0.H[1] // ..........e....... - add v9.8H, v24.8H, v27.8H // .............*.... + sqrdmulh v27.8H, v17.8H, v0.H[1] // ..........e....... + add v28.8H, v19.8H, v13.8H // .............*.... // gap // .................. // gap // .................. // gap // .................. - str q19, [x0, #16] // ...............*.. - mls v6.8H, v12.8H, v1.H[0] // ......e........... - str q4, [x0, #48] // .................* - str q10, [x0], #4*16 // ..............*... - str q9, [x0, #-32] // ................*. + str q10, [x0, #16] // ...............*.. + mls v3.8H, v25.8H, v1.H[0] // ......e........... + str q31, [x0, #48] // .................* + str q4, [x0], #4*16 // ..............*... 
+ str q28, [x0, #-32] // ................*. // gap // .................. // gap // .................. @@ -102,15 +103,15 @@ start: sub count, count, #1 cbnz count, start - ldr q8, [x0, #0] - mul v27.8H, v3.8H, v0.H[0] - ldr q24, [x0, #32] - mls v27.8H, v29.8H, v1.H[0] - add v10.8H, v8.8H, v6.8H - sub v19.8H, v8.8H, v6.8H - sub v4.8H, v24.8H, v27.8H - add v9.8H, v24.8H, v27.8H - str q19, [x0, #16] - str q4, [x0, #48] - str q10, [x0], #4*16 - str q9, [x0, #-32] \ No newline at end of file + ldr q15, [x0, #0] + mul v13.8H, v17.8H, v0.H[0] + ldr q19, [x0, #32] + mls v13.8H, v27.8H, v1.H[0] + add v4.8H, v15.8H, v3.8H + sub v10.8H, v15.8H, v3.8H + sub v31.8H, v19.8H, v13.8H + add v28.8H, v19.8H, v13.8H + str q10, [x0, #16] + str q31, [x0, #48] + str q4, [x0], #4*16 + str q28, [x0, #-32] \ No newline at end of file diff --git a/tutorial/README.md b/tutorial/README.md index c4914418..02cc6e19 100644 --- a/tutorial/README.md +++ b/tutorial/README.md @@ -244,6 +244,9 @@ At the top you can see the re-scheduled assembly and at the bottom you find the As comments next to the two sections, you can also see a visual representation on how these instructions have been rescheduled. You can see that various instructions have been moved around to achieve fewer stalls. +Note that if you run SLOTHY again, it may produce a different schedule with the same minimal number of stalls. +This is expected: the underlying constraint solver does not produce deterministic output. + In the scheduled code, you can see `// gap` where SLOTHY would expect a "gap" in the current model: This is not a pipeline stall in the sense of a wasted cycle, but rather an issue slot of the CPU that was not used. The Cortex-A55 is a dual-issue CPU meaning in ideal circumstances 2 instructions can be issued per cycle. @@ -399,14 +402,16 @@ Only the first and last iteration(s) may require to be treated separately; those postamble, respectively.
Let's look at an example demonstrating how SLOTHY can perform software pipelining for you. -Consider the simple case of performing the code from the previous example within a loop. This is exactly what the +Consider the simple case of performing the code from the previous example within a loop with a fixed number of iterations (>=2). This is exactly what the `aarch64_simple0_loop` example in SLOTHY does: ```nasm ... // .req and .macro as above count .req x2 - ldr qtwiddle, [twiddle_ptr, #0] - ldr qmodulus, modulus_ptr, #0 +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] + +mov count, #16 start: ldr qtwiddle, [twiddle_ptr, #0] @@ -451,98 +456,96 @@ look like: ```nasm // ... count .req x2 - - ldr qtwiddle, [twiddle_ptr, #0] - ldr qmodulus, [modulus_ptr, #0] - // Preamble - ldr q30, [x0, #48] - sqrdmulh v7.8H, v30.8H, v0.H[1] +ldr qtwiddle, [twiddle_ptr, #0] +ldr qmodulus, [modulus_ptr, #0] +mov count, #16 + ldr q3, [x0, #16] + sqrdmulh v7.8H, v3.8H, v0.H[1] sub count, count, #1 start: - ldr q28, [x0, #16] // .*................ + mul v3.8H, v3.8H, v0.H[0] // ....*............. // gap // .................. + ldr q19, [x0, #48] // ...*.............. // gap // .................. // gap // .................. - mul v5.8H, v30.8H, v0.H[0] // .........*........ // gap // .................. - ldr q20, [x0, #32] // ..*............... + ldr q15, [x0, #0] // *................. // gap // .................. // gap // .................. // gap // .................. - mul v14.8H, v28.8H, v0.H[0] // ....*............. + mls v3.8H, v7.8H, v1.H[0] // ......*........... // gap // .................. - mls v5.8H, v7.8H, v1.H[0] // ...........*...... + mul v13.8H, v19.8H, v0.H[0] // .........*........ // gap // .................. - sqrdmulh v13.8H, v28.8H, v0.H[1] // .....*............ + sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........*....... // gap // .................. - ldr q25, [x0, #0] // *................. + ldr q7, [x0, #32] // ..*...............
// gap // .................. // gap // .................. // gap // .................. - sub v11.8H, v20.8H, v5.8H // ............*..... + sub v17.8H, v15.8H, v3.8H // .......*.......... // gap // .................. - add v12.8H, v20.8H, v5.8H // .............*.... + add v10.8H, v15.8H, v3.8H // ........*......... // gap // .................. - mls v14.8H, v13.8H, v1.H[0] // ......*........... + mls v13.8H, v19.8H, v1.H[0] // ...........*...... // gap // .................. - ldr q30, [x0, #112] // ...e.............. + str q17, [x0, #16] // ...............*.. // gap // .................. + ldr q3, [x0, #80] // .e................ // gap // .................. // gap // .................. - str q11, [x0, #48] // .................* // gap // .................. - sub v26.8H, v25.8H, v14.8H // .......*.......... + add v15.8H, v7.8H, v13.8H // .............*.... // gap // .................. - str q12, [x0, #32] // ................*. + str q10, [x0], #4*16 // ..............*... // gap // .................. - add v13.8H, v25.8H, v14.8H // ........*......... + sub v13.8H, v7.8H, v13.8H // ............*..... // gap // .................. - str q26, [x0, #16] // ...............*.. + str q15, [x0, #-32] // ................*. // gap // .................. - sqrdmulh v7.8H, v30.8H, v0.H[1] // ..........e....... + sqrdmulh v7.8H, v3.8H, v0.H[1] // .....e............ // gap // .................. - str q13, [x0], #4*16 // ..............*... + str q13, [x0, #-16] // .................* // gap // .................. // original source code - // ldr q8, [x0, #0*16] // ........|.....*........... - // ldr q9, [x0, #1*16] // ........*................. - // ldr q10, [x0, #2*16] // ........|.*............... - // ldr q11, [x0, #3*16] // e.......|.........e....... - // mul v12.8h, v9.8h, v0.h[0] // ........|..*.............. - // sqrdmulh v9.8h, v9.8h, v0.h[1] // ........|....*............ - // mls v12.8h, v9.8h, v1.h[0] // ........|........*........ 
- // sub v9.8h, v8.8h, v12.8h // ..*.....|...........*..... - // add v8.8h, v8.8h, v12.8h // ....*...|.............*... - // mul v12.8h, v11.8h, v0.h[0] // ........|*................ - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ......e.|...............e. - // mls v12.8h, v11.8h, v1.h[0] // ........|...*............. - // sub v11.8h, v10.8h, v12.8h // ........|......*.......... - // add v10.8h, v10.8h, v12.8h // ........|.......*......... - // str q8, [x0], #4*16 // .......*|................* - // str q9, [x0, #-3*16] // .....*..|..............*.. - // str q10, [x0, #-2*16] // ...*....|............*.... - // str q11, [x0, #-1*16] // .*......|..........*...... + // ldr q8, [x0, #0*16] // .......|.*............... + // ldr q9, [x0, #1*16] // e......|..........e...... + // ldr q10, [x0, #2*16] // .......|.....*........... + // ldr q11, [x0, #3*16] // .......|*................ + // mul v12.8h, v9.8h, v0.h[0] // .......*................. + // sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e.|...............e. + // mls v12.8h, v9.8h, v1.h[0] // .......|..*.............. + // sub v9.8h, v8.8h, v12.8h // .......|......*.......... + // add v8.8h, v8.8h, v12.8h // .......|.......*......... + // mul v12.8h, v11.8h, v0.h[0] // .......|...*............. + // sqrdmulh v11.8h, v11.8h, v0.h[1] // .......|....*............ + // mls v12.8h, v11.8h, v1.h[0] // .......|........*........ + // sub v11.8h, v10.8h, v12.8h // ...*...|.............*... + // add v10.8h, v10.8h, v12.8h // .*.....|...........*..... + // str q8, [x0], #4*16 // ..*....|............*.... + // str q9, [x0, #-3*16] // .......|.........*....... + // str q10, [x0, #-2*16] // ....*..|..............*.. 
+ // str q11, [x0, #-1*16] // ......*|................* sub count, count, #1 cbnz count, start - // Postamble - ldr q28, [x0, #16] - mul v5.8H, v30.8H, v0.H[0] - ldr q20, [x0, #32] - mul v14.8H, v28.8H, v0.H[0] - mls v5.8H, v7.8H, v1.H[0] - sqrdmulh v13.8H, v28.8H, v0.H[1] - ldr q25, [x0, #0] - sub v11.8H, v20.8H, v5.8H - add v12.8H, v20.8H, v5.8H - mls v14.8H, v13.8H, v1.H[0] - str q11, [x0, #48] - sub v26.8H, v25.8H, v14.8H - str q12, [x0, #32] - add v13.8H, v25.8H, v14.8H - str q26, [x0, #16] - str q13, [x0], #4*16 + mul v3.8H, v3.8H, v0.H[0] + ldr q19, [x0, #48] + ldr q15, [x0, #0] + mls v3.8H, v7.8H, v1.H[0] + mul v13.8H, v19.8H, v0.H[0] + sqrdmulh v19.8H, v19.8H, v0.H[1] + ldr q7, [x0, #32] + sub v17.8H, v15.8H, v3.8H + add v10.8H, v15.8H, v3.8H + mls v13.8H, v19.8H, v1.H[0] + str q17, [x0, #16] + add v15.8H, v7.8H, v13.8H + str q10, [x0], #4*16 + sub v13.8H, v7.8H, v13.8H + str q15, [x0, #-32] + str q13, [x0, #-16] ``` Let's start by looking at the optimized loop body going from `start:` to `cbnz count, start`: