Skip to content

Commit

Permalink
minor tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
mkannwischer committed Mar 19, 2024
1 parent 3c5b4e0 commit 029d0a1
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 158 deletions.
5 changes: 3 additions & 2 deletions examples/naive/aarch64/aarch64_simple0_loop.s
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ modulus_ptr .req x2
.endm

// NOTE(review): this region is a rendered diff without +/- markers, so the
// pre-commit and post-commit versions of changed lines both appear. The file
// header reports 3 additions and 2 deletions for this hunk.
//
// count is bound to x2. The hunk header above shows modulus_ptr bound to x2
// as well, so the two names alias the same register (unless an .unreq sits
// in the part of the file not shown here -- TODO confirm).
count .req x2
// Pre-commit versions of the two loads. Their visible text is identical to
// the post-commit lines below; the change is presumably whitespace only.
ldr qtwiddle, [twiddle_ptr, #0]
ldr qmodulus, [modulus_ptr, #0]
// Post-commit versions of the same two loads.
ldr qtwiddle, [twiddle_ptr, #0]
ldr qmodulus, [modulus_ptr, #0]
// Added by this commit: initialise the counter to 16. This overwrites x2
// after the modulus load above has read through it.
mov count, #16
start:

// Load qdata0 from data_ptr + 0; the rest of the loop is collapsed in this view.
ldr qdata0, [data_ptr, #0*16]
Expand Down
113 changes: 57 additions & 56 deletions examples/opt/aarch64/aarch64_simple0_loop_opt_a55.s
Original file line number Diff line number Diff line change
Expand Up @@ -33,92 +33,93 @@ modulus_ptr .req x2
.endm

// NOTE(review): rendered diff without +/- markers. The first four
// instructions after the .req appear to be the pre-commit preamble and the
// six after them the post-commit preamble.
//
// count is bound to x2; the hunk header above shows modulus_ptr bound to x2
// too, so count must not be written before the modulus load.
count .req x2
// Pre-commit preamble: load the constants, then start the work the old loop
// body tagged 'e' (load from #48 and its sqrdmulh by v0.H[1]).
ldr qtwiddle, [twiddle_ptr, #0]
ldr qmodulus, [modulus_ptr, #0]
ldr q30, [x0, #48]
sqrdmulh v7.8H, v30.8H, v0.H[1]
// Post-commit preamble: same constant loads, then the counter is set to 16
// (overwriting x2 after the modulus load).
ldr qtwiddle, [twiddle_ptr, #0]
ldr qmodulus, [modulus_ptr, #0]
mov count, #16
// Start the work the new loop body tags 'e': load from #16 and its sqrdmulh
// by v0.H[1].
ldr q3, [x0, #16]
sqrdmulh v7.8H, v3.8H, v0.H[1]
// Drop the count by one before entering the loop; the straight-line code
// after the loop branch finishes the block whose first steps are done here.
sub count, count, #1
// NOTE(review): rendered diff without +/- markers. Pre-commit and post-commit
// versions of the loop body are interleaved below, so this listing is not a
// runnable instruction sequence as shown.
//
// Each trailing dot diagram has 18 columns, one per instruction of the
// "original source code" listing that follows the loop. A '*' marks which
// original instruction the line corresponds to. An 'e' marks an instruction
// that works on the next 64-byte block: the 'e' loads use #112 = 64 + 48 and
// #80 = 64 + 16, and x0 advances by 4*16 = 64 per iteration via the
// post-indexed store.
//
// Presumably v0 holds the twiddle constants and v1 the modulus (they are
// loaded through the qtwiddle/qmodulus aliases, whose definitions are outside
// this view) -- TODO confirm.
start:
ldr q28, [x0, #16] // .*................
mul v3.8H, v3.8H, v0.H[0] // ....*.............
// gap // ..................
ldr q19, [x0, #48] // ...*..............
// gap // ..................
// gap // ..................
mul v5.8H, v30.8H, v0.H[0] // .........*........
// gap // ..................
ldr q20, [x0, #32] // ..*...............
ldr q15, [x0, #0] // *.................
// gap // ..................
// gap // ..................
// gap // ..................
mul v14.8H, v28.8H, v0.H[0] // ....*.............
mls v3.8H, v7.8H, v1.H[0] // ......*...........
// gap // ..................
mls v5.8H, v7.8H, v1.H[0] // ...........*......
mul v13.8H, v19.8H, v0.H[0] // .........*........
// gap // ..................
sqrdmulh v13.8H, v28.8H, v0.H[1] // .....*............
sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........*.......
// gap // ..................
ldr q25, [x0, #0] // *.................
ldr q7, [x0, #32] // ..*...............
// gap // ..................
// gap // ..................
// gap // ..................
sub v11.8H, v20.8H, v5.8H // ............*.....
sub v17.8H, v15.8H, v3.8H // .......*..........
// gap // ..................
add v12.8H, v20.8H, v5.8H // .............*....
add v10.8H, v15.8H, v3.8H // ........*.........
// gap // ..................
mls v14.8H, v13.8H, v1.H[0] // ......*...........
mls v13.8H, v19.8H, v1.H[0] // ...........*......
// gap // ..................
// The next two loads are the 'e' loads: each reads past the current 64-byte
// block, before x0 has been advanced.
ldr q30, [x0, #112] // ...e..............
str q17, [x0, #16] // ...............*..
// gap // ..................
ldr q3, [x0, #80] // .e................
// gap // ..................
// gap // ..................
str q11, [x0, #48] // .................*
// gap // ..................
sub v26.8H, v25.8H, v14.8H // .......*..........
add v15.8H, v7.8H, v13.8H // .............*....
// gap // ..................
str q12, [x0, #32] // ................*.
str q10, [x0], #4*16 // ..............*...
// gap // ..................
add v13.8H, v25.8H, v14.8H // ........*.........
sub v13.8H, v7.8H, v13.8H // ............*.....
// gap // ..................
str q26, [x0, #16] // ...............*..
str q15, [x0, #-32] // ................*.
// gap // ..................
sqrdmulh v7.8H, v30.8H, v0.H[1] // ..........e.......
sqrdmulh v7.8H, v3.8H, v0.H[1] // .....e............
// gap // ..................
str q13, [x0], #4*16 // ..............*...
str q13, [x0, #-16] // .................*
// gap // ..................

// original source code
// ldr q8, [x0, #0*16] // ........|.....*...........
// ldr q9, [x0, #1*16] // ........*.................
// ldr q10, [x0, #2*16] // ........|.*...............
// ldr q11, [x0, #3*16] // e.......|.........e.......
// mul v12.8h, v9.8h, v0.h[0] // ........|..*..............
// sqrdmulh v9.8h, v9.8h, v0.h[1] // ........|....*............
// mls v12.8h, v9.8h, v1.h[0] // ........|........*........
// sub v9.8h, v8.8h, v12.8h // ..*.....|...........*.....
// add v8.8h, v8.8h, v12.8h // ....*...|.............*...
// mul v12.8h, v11.8h, v0.h[0] // ........|*................
// sqrdmulh v11.8h, v11.8h, v0.h[1] // ......e.|...............e.
// mls v12.8h, v11.8h, v1.h[0] // ........|...*.............
// sub v11.8h, v10.8h, v12.8h // ........|......*..........
// add v10.8h, v10.8h, v12.8h // ........|.......*.........
// str q8, [x0], #4*16 // .......*|................*
// str q9, [x0, #-3*16] // .....*..|..............*..
// str q10, [x0, #-2*16] // ...*....|............*....
// str q11, [x0, #-1*16] // .*......|..........*......
// ldr q8, [x0, #0*16] // .......|.*...............
// ldr q9, [x0, #1*16] // e......|..........e......
// ldr q10, [x0, #2*16] // .......|.....*...........
// ldr q11, [x0, #3*16] // .......|*................
// mul v12.8h, v9.8h, v0.h[0] // .......*.................
// sqrdmulh v9.8h, v9.8h, v0.h[1] // .....e.|...............e.
// mls v12.8h, v9.8h, v1.h[0] // .......|..*..............
// sub v9.8h, v8.8h, v12.8h // .......|......*..........
// add v8.8h, v8.8h, v12.8h // .......|.......*.........
// mul v12.8h, v11.8h, v0.h[0] // .......|...*.............
// sqrdmulh v11.8h, v11.8h, v0.h[1] // .......|....*............
// mls v12.8h, v11.8h, v1.h[0] // .......|........*........
// sub v11.8h, v10.8h, v12.8h // ...*...|.............*...
// add v10.8h, v10.8h, v12.8h // .*.....|...........*.....
// str q8, [x0], #4*16 // ..*....|............*....
// str q9, [x0, #-3*16] // .......|.........*.......
// str q10, [x0, #-2*16] // ....*..|..............*..
// str q11, [x0, #-1*16] // ......*|................*

// Loop control: decrement the counter and branch back to start while it is
// non-zero.
sub count, count, #1
cbnz count, start
// NOTE(review): rendered diff without +/- markers. The first 16 instructions
// below appear to be the pre-commit postamble and the last 16 the post-commit
// postamble; only one of the two exists in either version of the file.
//
// Pre-commit postamble: finishes the block whose #48 load and sqrdmulh were
// issued ahead of time (v30/v7), processes the #16 element from scratch, and
// stores all four results, advancing x0 by 64 on the last store.
ldr q28, [x0, #16]
mul v5.8H, v30.8H, v0.H[0]
ldr q20, [x0, #32]
mul v14.8H, v28.8H, v0.H[0]
mls v5.8H, v7.8H, v1.H[0]
sqrdmulh v13.8H, v28.8H, v0.H[1]
ldr q25, [x0, #0]
sub v11.8H, v20.8H, v5.8H
add v12.8H, v20.8H, v5.8H
mls v14.8H, v13.8H, v1.H[0]
str q11, [x0, #48]
sub v26.8H, v25.8H, v14.8H
str q12, [x0, #32]
add v13.8H, v25.8H, v14.8H
str q26, [x0, #16]
str q13, [x0], #4*16
// Post-commit postamble: finishes the block whose #16 load and sqrdmulh were
// issued ahead of time (v3/v7), processes the #48 element from scratch, and
// stores all four results. x0 is advanced by 64 partway through, so the last
// two stores use negative offsets.
mul v3.8H, v3.8H, v0.H[0]
ldr q19, [x0, #48]
ldr q15, [x0, #0]
mls v3.8H, v7.8H, v1.H[0]
mul v13.8H, v19.8H, v0.H[0]
sqrdmulh v19.8H, v19.8H, v0.H[1]
ldr q7, [x0, #32]
sub v17.8H, v15.8H, v3.8H
add v10.8H, v15.8H, v3.8H
mls v13.8H, v19.8H, v1.H[0]
str q17, [x0, #16]
add v15.8H, v7.8H, v13.8H
str q10, [x0], #4*16
sub v13.8H, v7.8H, v13.8H
str q15, [x0, #-32]
str q13, [x0, #-16]
77 changes: 39 additions & 38 deletions examples/opt/aarch64/aarch64_simple0_loop_opt_a72.s
Original file line number Diff line number Diff line change
Expand Up @@ -33,50 +33,51 @@ modulus_ptr .req x2
.endm

// NOTE(review): rendered diff without +/- markers. The first eight
// instructions after the .req appear to be the pre-commit preamble and the
// ten after them the post-commit preamble.
//
// count is bound to x2; the hunk header above shows modulus_ptr bound to x2
// too, so count must not be written before the modulus load.
count .req x2
// Pre-commit preamble: load the constants, load the #16 and #48 elements,
// and issue the mul/sqrdmulh/mls chain for #16 plus the sqrdmulh for #48.
ldr qtwiddle, [twiddle_ptr, #0]
ldr qmodulus, [modulus_ptr, #0]
ldr q28, [x0, #16]
ldr q3, [x0, #48]
sqrdmulh v12.8H, v28.8H, v0.H[1]
mul v6.8H, v28.8H, v0.H[0]
sqrdmulh v29.8H, v3.8H, v0.H[1]
mls v6.8H, v12.8H, v1.H[0]
// Post-commit preamble: same sequence with different register numbers, plus
// the counter set to 16 (overwriting x2 after the modulus load).
ldr qtwiddle, [twiddle_ptr, #0]
ldr qmodulus, [modulus_ptr, #0]
mov count, #16
ldr q7, [x0, #16]
ldr q17, [x0, #48]
sqrdmulh v25.8H, v7.8H, v0.H[1]
mul v3.8H, v7.8H, v0.H[0]
sqrdmulh v27.8H, v17.8H, v0.H[1]
mls v3.8H, v25.8H, v1.H[0]
// Drop the count by one before entering the loop; the straight-line code
// after the loop branch finishes the block whose first steps are done here.
sub count, count, #1
// NOTE(review): rendered diff without +/- markers. Pre-commit and post-commit
// versions of the loop body are interleaved below, so this listing is not a
// runnable instruction sequence as shown.
//
// Each trailing dot diagram has 18 columns, one per instruction of the
// original (unscheduled) loop body. A '*' marks which original instruction
// the line corresponds to. An 'e' marks an instruction that works on the next
// 64-byte block: the 'e' loads use #80 = 64 + 16 and #112 = 64 + 48, and x0
// advances by 4*16 = 64 per iteration via the post-indexed store.
start:
ldr q8, [x0, #0] // *.................
ldr q28, [x0, #80] // .e................
mul v27.8H, v3.8H, v0.H[0] // .........*........
ldr q24, [x0, #32] // ..*...............
ldr q3, [x0, #112] // ...e..............
ldr q15, [x0, #0] // *.................
ldr q7, [x0, #80] // .e................
mul v13.8H, v17.8H, v0.H[0] // .........*........
ldr q19, [x0, #32] // ..*...............
ldr q17, [x0, #112] // ...e..............
// gap // ..................
// gap // ..................
// gap // ..................
mls v27.8H, v29.8H, v1.H[0] // ...........*......
mls v13.8H, v27.8H, v1.H[0] // ...........*......
// gap // ..................
// gap // ..................
// gap // ..................
sqrdmulh v25.8H, v7.8H, v0.H[1] // .....e............
// gap // ..................
sqrdmulh v12.8H, v28.8H, v0.H[1] // .....e............
add v10.8H, v8.8H, v6.8H // ........*.........
sub v19.8H, v8.8H, v6.8H // .......*..........
add v4.8H, v15.8H, v3.8H // ........*.........
sub v10.8H, v15.8H, v3.8H // .......*..........
// gap // ..................
// gap // ..................
mul v6.8H, v28.8H, v0.H[0] // ....e.............
mul v3.8H, v7.8H, v0.H[0] // ....e.............
// gap // ..................
// gap // ..................
sub v4.8H, v24.8H, v27.8H // ............*.....
sub v31.8H, v19.8H, v13.8H // ............*.....
// gap // ..................
// gap // ..................
sqrdmulh v29.8H, v3.8H, v0.H[1] // ..........e.......
add v9.8H, v24.8H, v27.8H // .............*....
sqrdmulh v27.8H, v17.8H, v0.H[1] // ..........e.......
add v28.8H, v19.8H, v13.8H // .............*....
// gap // ..................
// gap // ..................
// gap // ..................
// Stores for the current block. Offsets #16 and #48 are used before the
// post-indexed store advances x0; #-32 is used after it.
str q19, [x0, #16] // ...............*..
mls v6.8H, v12.8H, v1.H[0] // ......e...........
str q4, [x0, #48] // .................*
str q10, [x0], #4*16 // ..............*...
str q9, [x0, #-32] // ................*.
str q10, [x0, #16] // ...............*..
mls v3.8H, v25.8H, v1.H[0] // ......e...........
str q31, [x0, #48] // .................*
str q4, [x0], #4*16 // ..............*...
str q28, [x0, #-32] // ................*.
// gap // ..................
// gap // ..................

Expand All @@ -102,15 +103,15 @@ start:

// Loop control: decrement the counter and branch back to start while it is
// non-zero.
sub count, count, #1
cbnz count, start
// NOTE(review): rendered diff without +/- markers. The first 12 instructions
// below appear to be the pre-commit postamble and the last 12 the post-commit
// postamble; only one of the two exists in either version of the file.
//
// Pre-commit postamble: finishes the final block using the values computed
// ahead of time (v3, v6, v29), with no further 'e' work issued.
ldr q8, [x0, #0]
mul v27.8H, v3.8H, v0.H[0]
ldr q24, [x0, #32]
mls v27.8H, v29.8H, v1.H[0]
add v10.8H, v8.8H, v6.8H
sub v19.8H, v8.8H, v6.8H
sub v4.8H, v24.8H, v27.8H
add v9.8H, v24.8H, v27.8H
str q19, [x0, #16]
str q4, [x0, #48]
str q10, [x0], #4*16
str q9, [x0, #-32]
// Post-commit postamble: same sequence with the post-commit register
// numbers (v17, v3, v27 carry the values computed ahead of time).
ldr q15, [x0, #0]
mul v13.8H, v17.8H, v0.H[0]
ldr q19, [x0, #32]
mls v13.8H, v27.8H, v1.H[0]
add v4.8H, v15.8H, v3.8H
sub v10.8H, v15.8H, v3.8H
sub v31.8H, v19.8H, v13.8H
add v28.8H, v19.8H, v13.8H
str q10, [x0, #16]
str q31, [x0, #48]
str q4, [x0], #4*16
str q28, [x0, #-32]
Loading

0 comments on commit 029d0a1

Please sign in to comment.