From d5f38b8b36fcd48be66ae1d8d329be913d2f8989 Mon Sep 17 00:00:00 2001 From: cchudant Date: Fri, 3 Mar 2023 23:43:30 +0100 Subject: [PATCH 1/7] kernel generation --- .../2x12/packed_packed_loop1/avx-512.tmpli | 60 +++++++ linalg/x86_64/avx512/4x3/i32.tmpli | 50 ++++++ .../x86_64/avx512/avx512_mmm_f32_32x12.tmpl | 22 +++ linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl | 148 ++++++++++++++++++ linalg/x86_64/avx512/f32_add_mat_mul.tmpliq | 62 ++++++++ .../avx512/f32_add_row_col_products.tmpliq | 23 +++ linalg/x86_64/avx512/f32_add_unicast.tmpliq | 94 +++++++++++ linalg/x86_64/avx512/f32_store_clear.tmpliq | 52 ++++++ linalg/x86_64/avx512/postamble.tmpliq | 22 +++ linalg/x86_64/avx512/preamble.tmpliq | 17 ++ linalg/x86_64/avx512/zmm_scalar.tmpliq | 6 +- 11 files changed, 553 insertions(+), 3 deletions(-) create mode 100644 linalg/x86_64/avx512/2x12/packed_packed_loop1/avx-512.tmpli create mode 100644 linalg/x86_64/avx512/4x3/i32.tmpli create mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_32x12.tmpl create mode 100644 linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl create mode 100644 linalg/x86_64/avx512/f32_add_mat_mul.tmpliq create mode 100644 linalg/x86_64/avx512/f32_add_row_col_products.tmpliq create mode 100644 linalg/x86_64/avx512/f32_add_unicast.tmpliq create mode 100644 linalg/x86_64/avx512/f32_store_clear.tmpliq diff --git a/linalg/x86_64/avx512/2x12/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/2x12/packed_packed_loop1/avx-512.tmpli new file mode 100644 index 0000000000..8ca7750da9 --- /dev/null +++ b/linalg/x86_64/avx512/2x12/packed_packed_loop1/avx-512.tmpli @@ -0,0 +1,60 @@ + // Tile size: 2x12 + // Accumulators: zmm0-23 + // Col regs: zmm25 + // Row regs: zmm26-27 + + prefetcht0 [rax + 256] + prefetcht0 [rax+64 + 256] + vmovaps zmm26, [rax] + vmovaps zmm27, [rax+64] + + vbroadcastss zmm25, dword ptr [rcx+0] + vfmadd231ps zmm0, zmm26, zmm25 + vfmadd231ps zmm1, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+4] + vfmadd231ps zmm2, zmm26, zmm25 + vfmadd231ps zmm3, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+8] + vfmadd231ps zmm4, zmm26, zmm25 + vfmadd231ps zmm5, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+12] + vfmadd231ps zmm6, zmm26, zmm25 + vfmadd231ps zmm7, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+16] + vfmadd231ps zmm8, zmm26, zmm25 + vfmadd231ps zmm9, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+20] + vfmadd231ps zmm10, zmm26, zmm25 + vfmadd231ps zmm11, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+24] + vfmadd231ps zmm12, zmm26, zmm25 + vfmadd231ps zmm13, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+28] + vfmadd231ps zmm14, zmm26, zmm25 + vfmadd231ps zmm15, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+32] + vfmadd231ps zmm16, zmm26, zmm25 + vfmadd231ps zmm17, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+36] + vfmadd231ps zmm18, zmm26, zmm25 + vfmadd231ps zmm19, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+40] + vfmadd231ps zmm20, zmm26, zmm25 + vfmadd231ps zmm21, zmm27, zmm25 + + vbroadcastss zmm25, dword ptr [rcx+44] + vfmadd231ps zmm22, zmm26, zmm25 + vfmadd231ps zmm23, zmm27, zmm25 + + add rax, 128 + add rcx, 48 diff --git a/linalg/x86_64/avx512/4x3/i32.tmpli b/linalg/x86_64/avx512/4x3/i32.tmpli new file mode 100644 index 0000000000..d6707e92c8 --- /dev/null +++ b/linalg/x86_64/avx512/4x3/i32.tmpli @@ -0,0 +1,50 @@ + // Tile size: 4x3 + // Accumulators: 0-11 + // Col regs: zmm12 + // Row regs: zmm13-15 + + // Load col of A + vmovaps zmm12, [rax] + + // Fill 3 cols of B + vbroadcastss zmm13, 
dword ptr [rcx + 0]
+    vbroadcastss zmm14, dword ptr [rcx + 4]
+    vbroadcastss zmm15, dword ptr [rcx + 8]
+
+    // N.B. Stepping cols in inner loop
+    vpmulld zmm31, zmm12, zmm13
+    vpaddd zmm0, zmm0, zmm31
+    vpmulld zmm30, zmm12, zmm14
+    vpaddd zmm4, zmm4, zmm30
+    vpmulld zmm29, zmm12, zmm15
+    vpaddd zmm8, zmm8, zmm29
+
+    vmovaps zmm12, [rax+64]
+
+    vpmulld zmm31, zmm12, zmm13
+    vpaddd zmm1, zmm1, zmm31
+    vpmulld zmm30, zmm12, zmm14
+    vpaddd zmm5, zmm5, zmm30
+    vpmulld zmm29, zmm12, zmm15
+    vpaddd zmm9, zmm9, zmm29
+
+    vmovaps zmm12, [rax+128]
+
+    vpmulld zmm31, zmm12, zmm13
+    vpaddd zmm2, zmm2, zmm31
+    vpmulld zmm30, zmm12, zmm14
+    vpaddd zmm6, zmm6, zmm30
+    vpmulld zmm29, zmm12, zmm15
+    vpaddd zmm10, zmm10, zmm29
+
+    vmovaps zmm12, [rax+192]
+
+    vpmulld zmm31, zmm12, zmm13
+    vpaddd zmm3, zmm3, zmm31
+    vpmulld zmm30, zmm12, zmm14
+    vpaddd zmm7, zmm7, zmm30
+    vpmulld zmm29, zmm12, zmm15
+    vpaddd zmm11, zmm11, zmm29
+
+    add rcx, 12
+    add rax, 256
diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_32x12.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_32x12.tmpl
new file mode 100644
index 0000000000..e39e426278
--- /dev/null
+++ b/linalg/x86_64/avx512/avx512_mmm_f32_32x12.tmpl
@@ -0,0 +1,22 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm f32 32 x 12:
+
+    zmm0 zmm2 zmm4 zmm6 zmm8 zmm10 zmm12 zmm14 zmm16 zmm18 zmm20 zmm22
+    zmm1 zmm3 zmm5 zmm7 zmm9 zmm11 zmm13 zmm15 zmm17 zmm19 zmm21 zmm23
+
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" size:"32x12", suffix:suffix, G:G, arch:"avx512" %}
+
+{% include "f32_add_mat_mul.tmpliq" mr:32, nr:12 %}
+{% include "f32_scalars.tmpliq" from:0, to:23 %}
+{% include "f32_per_rows.tmpliq" mr:32, from:0, to:23 %}
+{% include "f32_per_cols.tmpliq" mr:32, from:0, to:23 %}
+{% include "f32_store_clear.tmpliq" mr:32, nr:12 %}
+{% include "f32_add_row_col_products.tmpliq" mr:32, nr:12 %}
+{% include "f32_add_unicast.tmpliq" mr:32, nr:12 %}
+
+{% include "postamble.tmpliq" size:"32x12", suffix:suffix, G:G, L:L, arch:"avx512" %}
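For reference, the accumulator grid pictured in the comment above is numbered
column-major: column j of the tile occupies zmm[j*(mr/16)] through
zmm[j*(mr/16) + mr/16 - 1]. A minimal Rust sketch of that mapping (a
hypothetical helper, not part of this patch):

    // Accumulator register for 16-row block `i` of column `j` of an
    // mr x nr f32 tile (mr a multiple of 16, the zmm f32 lane count).
    fn accumulator_reg(mr: usize, i: usize, j: usize) -> usize {
        let arch_mr = mr / 16; // zmm registers per tile column
        j * arch_mr + i
    }

    // For the 32x12 kernel above: column 0 is zmm0/zmm1, column 11 is
    // zmm22/zmm23, matching the layout comment.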
diff --git a/linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl b/linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl
new file mode 100644
index 0000000000..8146509240
--- /dev/null
+++ b/linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl
@@ -0,0 +1,148 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm i32 64 x 3:
+
+    zmm0 zmm4 zmm8
+    zmm1 zmm5 zmm9
+    zmm2 zmm6 zmm10
+    zmm3 zmm7 zmm11
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
+{% include "preamble.tmpliq" size:"64x3", suffix:suffix, G:G, arch:"avx512" %}
+
+{{L}}clear:
+    vzeroall
+    jmp {{L}}non_linear_loop
+
+{{L}}add_mat_mul:
+    mov rcx, [rdi + 24] // B
+    mov rax, [rdi + 16] // A
+
+    mov rbx, [rdi + 8] // k
+    test rcx, rcx
+    jz {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+    {% include "4x3/i32.tmpli" %}
+
+    dec rbx
+    jnz {{L}}main_loop_packed_packed
+
+    jmp {{L}}non_linear_loop
+
+{% include "i32_scalars.tmpliq" from:0, to:11 %}
+{% include "i32_per_rows.tmpliq" mr:64, from:0, to:11 %}
+{% include "i32_per_cols.tmpliq" mr:64, from:0, to:11 %}
+
+{{L}}add_unicast: // todo: not done
+
+    mov r10, [rdi + 8] // c ptr
+    mov rsi, [rdi + 16] // row stride
+    mov rbx, [rdi + 24] // col stride
+
+    mov eax, 0
+
+{% for i in (0..3) %}
+    pinsrd xmm14, eax, {{i}}
+    add eax, esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd xmm15, eax, {{i}}
+    add eax, esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd xmm12, eax, {{i}}
+    add eax, esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd xmm13, eax, {{i}}
+    add eax, esi
+{% endfor %}
+
+    vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15
+    vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13
+    vinsertf32x8 zmm14, zmm14, ymm13, 1
+
+{% for i in (0..2) %}
+    kxnorw k1,k1,k1
+    vgatherdps zmm12{k1}, [ r10 + zmm14 ]
+    add r10, rbx
+    vaddps zmm{{i | times: 4}}, zmm{{i | times: 4}}, zmm12
+{% endfor %}
+
+    imul esi, 16
+    vpbroadcastd zmm15, esi
+
+{% for j in (1..3) %}
+    mov r10, [rdi + 8]
+    vpaddd zmm14, zmm14, zmm15
+
+    {% for i in (0..2) %}
+    kxnorw k1,k1,k1
+    vgatherdps zmm12{k1}, [ r10 + zmm14 ]
+    add r10, rbx
+    vaddps zmm{{i | times: 4 | plus: j}}, zmm{{i | times: 4 | plus: j}}, zmm12
+    {% endfor %}
+{% endfor %}
+
+    jmp {{L}}non_linear_loop
+
+{{L}}add_row_col_products:
+    mov rax, [ rdi + 8 ]
+    mov rbx, [ rdi + 16 ]
+
+    vbroadcastss zmm13, dword ptr [rbx]
+    vbroadcastss zmm14, dword ptr [rbx+4]
+    vbroadcastss zmm15, dword ptr [rbx+8]
+
+{% for i in (0..3) %}
+    vmovups zmm12, zmmword ptr [rax+{{i | times:64}}]
+    vfmadd231ps zmm{{i}}, zmm12, zmm13
+    vfmadd231ps zmm{{i | plus: 4}}, zmm12, zmm14
+    vfmadd231ps zmm{{i | plus: 8}}, zmm12, zmm15
+{% endfor %}
+
+    jmp {{L}}non_linear_loop
+
+{{L}}store:
+    mov r8, [rdi + 8] // c ptr
+    mov rsi, [rdi + 16] // row stride
+    mov rbx, [rdi + 24] // col stride
+
+    // tops of cols
+    lea r9, [ r8 + rbx ]
+    lea r10, [ r8 + 2 * rbx ]
+    lea r11, [ r10 + rbx ]
+
+    {% for word in (0..3) %}
+        {% for quarter in (0..3) %}
+            {% for r in (0..2) %}
+                vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 4 | plus: word}}, {{quarter}}
+            {% endfor %}
+            {% for row in (0..3) %}
+                {% for i in (0..2) %}
+                    vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}}
+                    add r{{i | plus: 8}}, rsi
+                {% endfor %}
+            {% endfor %}
+        {% endfor %}
+    {% endfor %}
+
+    jmp {{L}}non_linear_loop
+
+{% include "postamble.tmpliq" size:"64x3", suffix:suffix, G:G, L:L, arch:"avx512" %}
+
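The add_unicast path above assembles, one pinsrd at a time, a vector of
sixteen row offsets that vgatherdps then uses to read a 16-row slice of a
strided C. A sketch of the value built in zmm14, assuming the row stride
loaded from [rdi + 16] is in bytes (hypothetical helper, not from the patch):

    // offsets[i] = i * row_stride; the later `imul esi, 16` /
    // `vpaddd zmm14, zmm14, zmm15` step shifts the whole vector down by
    // sixteen rows for each following block of the 64-row tile.
    fn gather_offsets(row_stride: i32) -> [i32; 16] {
        core::array::from_fn(|i| i as i32 * row_stride)
    }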
diff --git a/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq
new file mode 100644
index 0000000000..8700acfae8
--- /dev/null
+++ b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq
@@ -0,0 +1,62 @@
+{% comment %}
+Generate the code for the add_mat_mul instruction.
+---
+Arguments:
+    mr - kernel height (rows of the tile), in number of f32 elements
+    nr - kernel width (columns of the tile), in number of f32 elements
+{% endcomment %}
+
+{{L}}add_mat_mul:
+    mov rcx, [rdi + 24] // B
+    mov rax, [rdi + 16] // A
+
+    mov rbx, [rdi + 8] // k
+    test rcx, rcx
+    jz {{L}}non_linear_loop
+
+{{L}}main_loop_packed_packed:
+
+{% assign arch_mr = mr | divided_by: 16 %}
+{% assign arch_mr_min_1 = mr | divided_by: 16 | minus: 1 %}
+
+{% assign nr_min_1 = nr | minus: 1 %}
+
+// total bytes of the tile on the m axis
+{% assign m_total_bytes = mr | times: 4 %}
+// total bytes of the tile on the n axis
+{% assign n_total_bytes = nr | times: 4 %}
+
+// first register to be used for rows of A
+{% assign row_reg = arch_mr | times: nr %}
+// the column (B broadcast) register
+{% assign col_reg = row_reg | plus: arch_mr | plus: 1 %}
+
+{% assign prefetch_dist = 2 %}
+
+{% for i in (0..arch_mr_min_1) %}
+    prefetcht0 [rax + {{i | times:64}} + {{m_total_bytes | times:prefetch_dist}}]
+{% endfor %}
+
+{% for i in (0..arch_mr_min_1) %}
+    vmovaps zmm{{row_reg | plus:i}}, [rax + {{i | times:64}}]
+{% endfor %}
+
+// this loop accesses A 16 elements at a time
+// and B 1 element at a time
+
+{% for i in (0..nr_min_1) %}
+    vbroadcastss zmm{{col_reg}}, dword ptr [rcx + {{i | times:4}}]
+
+    {% for j in (0..arch_mr_min_1) %}
+    vfmadd231ps zmm{{i | times:arch_mr | plus:j}}, zmm{{row_reg | plus:j}}, zmm{{col_reg}}
+    {% endfor %}
+
+{% endfor %}
+
+    add rax, {{m_total_bytes}}
+    add rcx, {{n_total_bytes}}
+
+    dec rbx
+    jnz {{L}}main_loop_packed_packed
+
+    jmp {{L}}non_linear_loop
diff --git a/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq b/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq
new file mode 100644
index 0000000000..dd91d8e1b7
--- /dev/null
+++ b/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq
@@ -0,0 +1,23 @@
+{% comment %}
+Generate the code for the add_row_col_products instruction.
+---
+Arguments:
+    nr - kernel width (columns of the tile), in number of f32 elements
+{% endcomment %}
+
+{% assign nr_min_1 = nr | minus: 1 %}
+
+{{L}}add_row_col_products:
+    mov rax, [ rdi + 8 ]
+    mov rbx, [ rdi + 16 ]
+
+    vmovups zmm31, zmmword ptr [rax]
+    vmovups zmm30, zmmword ptr [rax+64]
+
+{% for i in (0..nr_min_1) %}
+    vbroadcastss zmm29, dword ptr [rbx + {{i|times:4}} ]
+    vfmadd231ps zmm{{i | times: 2}}, zmm31, zmm29
+    vfmadd231ps zmm{{i | times: 2 | plus: 1}}, zmm30, zmm29
+{% endfor %}
+
+    jmp {{L}}non_linear_loop
\ No newline at end of file
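The Liquid assigns in f32_add_mat_mul.tmpliq above fix the register plan for
every generated kernel: accumulators first, then the A (row) registers, then
the broadcast register for B. A Rust sketch of the same arithmetic
(hypothetical, mirroring the template's assigns):

    // Returns (row_reg, col_reg) as computed by the template:
    // accumulators take zmm0..=zmm(arch_mr*nr - 1), A loads take the next
    // arch_mr registers, and the B broadcast register sits one above them.
    fn register_plan(mr: usize, nr: usize) -> (usize, usize) {
        let arch_mr = mr / 16;               // zmm loads of A per k step
        let row_reg = arch_mr * nr;          // first A register
        let col_reg = row_reg + arch_mr + 1; // B broadcast register
        (row_reg, col_reg)
    }

For the 32x12 kernel this gives A in zmm24-zmm25 and the broadcast in zmm27.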
diff --git a/linalg/x86_64/avx512/f32_add_unicast.tmpliq b/linalg/x86_64/avx512/f32_add_unicast.tmpliq
new file mode 100644
index 0000000000..bdf7501330
--- /dev/null
+++ b/linalg/x86_64/avx512/f32_add_unicast.tmpliq
@@ -0,0 +1,94 @@
+{% comment %}
+Generate the code for the add_unicast instruction.
+---
+Arguments:
+    mr - kernel height (rows of the tile), in number of f32 elements
+    nr - kernel width (columns of the tile), in number of f32 elements
+{% endcomment %}
+
+
+{{L}}add_unicast:
+
+    mov r10, [rdi + 8] // c ptr
+    mov rsi, [rdi + 16] // row stride
+    mov rbx, [rdi + 24] // col stride
+
+    mov eax, 0
+
+// this is a hack - we have to shuffle data around because pinsrd and
+// vperm2f128 cannot encode registers ymm16-ymm31, so we need scratch
+// registers among ymm0-ymm15 - but that is where our data lives :/
+
+{% assign last_data_reg = mr | divided_by:16 | times:nr | minus:1 %}
+{% if last_data_reg >= 12 %}
+    vmovups zmm28, zmm12
+{% endif %}
+{% if last_data_reg >= 13 %}
+    vmovups zmm29, zmm13
+{% endif %}
+{% if last_data_reg >= 14 %}
+    vmovups zmm30, zmm14
+{% endif %}
+{% if last_data_reg >= 15 %}
+    vmovups zmm31, zmm15
+{% endif %}
+
+{% for i in (0..3) %}
+    pinsrd xmm14, eax, {{i}}
+    add eax, esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd xmm15, eax, {{i}}
+    add eax, esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd xmm12, eax, {{i}}
+    add eax, esi
+{% endfor %}
+{% for i in (0..3) %}
+    pinsrd xmm13, eax, {{i}}
+    add eax, esi
+{% endfor %}
+
+    vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15
+    vperm2f128 ymm13, ymm12, ymm13, 32 // ymm13 <- xmm12::xmm13
+    vinsertf32x8 zmm14, zmm14, ymm13, 1
+
+    vmovups zmm25, zmm15
+    vmovups zmm26, zmm14
+    vmovups zmm27, zmm12
+
+{% if last_data_reg >= 12 %}
+    vmovups zmm12, zmm28
+{% endif %}
+{% if last_data_reg >= 13 %}
+    vmovups zmm13, zmm29
+{% endif %}
+{% if last_data_reg >= 14 %}
+    vmovups zmm14, zmm30
+{% endif %}
+{% if last_data_reg >= 15 %}
+    vmovups zmm15, zmm31
+{% endif %}
+
+{% assign nr_min_1 = nr | minus: 1 %}
+
+{% for i in (0..nr_min_1) %}
+    kxnorw k1,k1,k1
+    vgatherdps zmm27{k1}, [r10 + zmm26]
+    add r10, rbx
+    vaddps zmm{{i | times:2}}, zmm{{i | times:2}}, zmm27
+{% endfor %}
+
+    mov r10, [rdi + 8]
+    imul esi, 16
+    vpbroadcastd zmm25, esi
+    vpaddd zmm26, zmm26, zmm25
+
+{% for i in (0..nr_min_1) %}
+    kxnorw k1,k1,k1
+    vgatherdps zmm27{k1}, [r10 + zmm26]
+    add r10, rbx
+    vaddps zmm{{i | times:2 | plus:1}}, zmm{{i | times:2 | plus: 1}}, zmm27
+{% endfor %}
+
+    jmp {{L}}non_linear_loop
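Semantically, add_unicast accumulates a strided tile of C from memory into
the accumulators; the gather gymnastics above only exist because the offsets
have to be materialized in a vector register. A plain-Rust reference of the
intended effect (a sketch; strides are in bytes in the kernel, in elements
here):

    fn add_unicast<const MR: usize, const NR: usize>(
        acc: &mut [[f32; NR]; MR],
        c: &[f32],
        row_stride: usize,
        col_stride: usize,
    ) {
        for i in 0..MR {
            for j in 0..NR {
                acc[i][j] += c[i * row_stride + j * col_stride];
            }
        }
    }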
diff --git a/linalg/x86_64/avx512/f32_store_clear.tmpliq b/linalg/x86_64/avx512/f32_store_clear.tmpliq
new file mode 100644
index 0000000000..2e696cba1a
--- /dev/null
+++ b/linalg/x86_64/avx512/f32_store_clear.tmpliq
@@ -0,0 +1,52 @@
+{% comment %}
+Generate the code for the store and clear instructions.
+---
+Arguments:
+    mr - kernel height (rows of the tile), in number of f32 elements
+    nr - kernel width (columns of the tile), in number of f32 elements
+{% endcomment %}
+
+{% assign arch_mr = mr | divided_by:16 %}
+{% assign arch_mr_min_1 = mr | divided_by:16 | minus:1 %}
+{% assign nr_min_1 = nr | minus:1 %}
+
+{{L}}store:
+    mov r8, [rdi + 8] // c ptr
+    mov rsi, [rdi + 16] // row stride
+    mov rbx, [rdi + 24] // col stride
+
+    mov r9, r8 // current row
+    mov r10, r8 // current col
+
+    {% for regcol in (0..nr_min_1) %}
+        {% for regrow in (0..arch_mr_min_1) %}
+            {% for quarter in (0..3) %}
+                vextractf32x4 xmm31, zmm{{regcol | times:arch_mr | plus:regrow}}, {{quarter}}
+                {% for innerrow in (0..3) %}
+                    vextractps dword ptr [r9], xmm31, {{innerrow}}
+                    add r9, rsi
+                {% endfor %}
+            {% endfor %}
+        {% endfor %}
+        add r10, rbx
+        mov r9, r10
+    {% endfor %}
+
+    jmp {{L}}non_linear_loop
+
+{% assign last_reg = mr | divided_by:16 | times:nr | minus:1 %}
+
+{{L}}clear:
+    vzeroall
+    // turns out vzeroall only zeroes zmm0 to zmm15
+    {% if last_reg >= 16 %}
+    {% for reg in (16..last_reg) %}
+    vmovups zmm{{reg}}, zmm0
+    {% endfor %}
+    {% endif %}
+
+    jmp {{L}}non_linear_loop
diff --git a/linalg/x86_64/avx512/postamble.tmpliq b/linalg/x86_64/avx512/postamble.tmpliq
index ff3071a71a..6482e66ddf 100644
--- a/linalg/x86_64/avx512/postamble.tmpliq
+++ b/linalg/x86_64/avx512/postamble.tmpliq
@@ -1,3 +1,25 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/* mmm f32 32 x 12:
+
+    zmm0 zmm2 ... zmm22
+    zmm1 zmm3 ... zmm23
+
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
 {{L}}return:
     ldmxcsr [rsp + 4]
     add rsp, 8
diff --git a/linalg/x86_64/avx512/preamble.tmpliq b/linalg/x86_64/avx512/preamble.tmpliq
index 3ed2f7c309..10e2437562 100644
--- a/linalg/x86_64/avx512/preamble.tmpliq
+++ b/linalg/x86_64/avx512/preamble.tmpliq
@@ -1,3 +1,20 @@
+{% comment %}
+// vim: set syntax=asm :
+
+/*
+System V ABI:
+    args: rdi, rsi, rdx, rcx, r8, r9
+    preserve: rbx, rsp, rbp, r12, r13, r14, r15
+    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
+    return: rax (+rdx)
+
+Windows ABI:
+    args: RCX, RDX, R8, R9
+    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
+    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
+    return: rax (+rdx)
+*/
+{% endcomment %}
+
 {% if msvc %}
 
 _text segment
diff --git a/linalg/x86_64/avx512/zmm_scalar.tmpliq b/linalg/x86_64/avx512/zmm_scalar.tmpliq
index 43373c9d82..c38a5965cf 100644
--- a/linalg/x86_64/avx512/zmm_scalar.tmpliq
+++ b/linalg/x86_64/avx512/zmm_scalar.tmpliq
@@ -1,14 +1,14 @@
 // vim: set syntax=asm :
 
 {{L}}{{label}}:
-    vbroadcastss zmm12, dword ptr [rdi + 8]
+    vbroadcastss zmm31, dword ptr [rdi + 8]
 
 {% if flipped %}
     {% for reg in (from..to) %}
-        {{op}} zmm{{reg}}, zmm{{reg}}, zmm12
+        {{op}} zmm{{reg}}, zmm{{reg}}, zmm31
     {% endfor %}
 {% else %}
     {% for reg in (from..to) %}
-        {{op}} zmm{{reg}}, zmm12, zmm{{reg}}
+        {{op}} zmm{{reg}}, zmm31, zmm{{reg}}
     {% endfor %}
 {% endif %}

From 5109d974ae7563cca313c3fadf265464301ff3c6 Mon Sep 17 00:00:00 2001
From: Charles Chudant
Date: Fri, 10 Mar 2023 22:25:47 +0000
Subject: [PATCH 2/7] fix kernel gen, throughput benchmarks

---
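Note: the avx_kernels_max table introduced in build.rs below follows from a
simple register budget, assuming the plan used by f32_add_mat_mul.tmpliq: an
m x n tile needs (m/16)*n accumulators, plus m/16 row registers, plus one
broadcast register, all within the 32 zmm registers. A sketch that reproduces
the table (hypothetical, not part of the build script):

    fn max_m_for_n(n: usize) -> usize {
        (16usize..)
            .step_by(16)
            .take_while(|&m| (m / 16) * n + m / 16 + 1 <= 32)
            .last()
            .unwrap()
    }

    // max_m_for_n(1) == 240, max_m_for_n(2) == 160, max_m_for_n(3) == 112,
    // ... and max_m_for_n(15..=29) == 16, matching the table's entries.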
linalg/Cargo.toml | 5 + linalg/benches/intel.rs | 4 +- linalg/benches/kernel_test.rs | 84 +++++++++ linalg/benches/utils.rs | 3 +- linalg/build.rs | 111 ++++++++---- linalg/src/x86_64_fma/mmm.rs | 64 ++++++- .../packed_packed_loop1/avx-512-unroll.tmpli | 59 ------- .../10x1/packed_packed_loop1/avx-512.tmpli | 33 ---- .../1x1/packed_packed_loop1/avx-512.tmpli | 7 - .../1x1/packed_packed_loop1/unroll-16.tmpli | 68 -------- .../1x1/packed_packed_loop1/unroll-4.tmpli | 24 --- .../1x1/packed_packed_loop1/unroll-8.tmpli | 29 ---- .../1x1/packed_packed_loop1/unroll.tmpli | 11 -- .../1x12/packed_packed_loop1/avx-512.tmpli | 45 ----- .../2x12/packed_packed_loop1/avx-512.tmpli | 60 ------- .../packed_packed_loop1/avx-512-unroll.tmpli | 53 ------ .../2x5/packed_packed_loop1/avx-512.tmpli | 30 ---- .../packed_packed_loop1/avx-512-unroll.tmpli | 71 -------- .../2x6/packed_packed_loop1/avx-512.tmpli | 39 ----- .../packed_packed_loop1/avx-512-unroll.tmpli | 63 ------- .../3x4/packed_packed_loop1/avx-512.tmpli | 35 ---- linalg/x86_64/avx512/4x3/i32.tmpli | 50 ------ .../packed_packed_loop1/avx-512-unroll.tmpli | 69 -------- .../4x3/packed_packed_loop1/avx-512.tmpli | 38 ---- .../packed_packed_loop1/avx-512-unroll.tmpli | 63 ------- .../5x2/packed_packed_loop1/avx-512.tmpli | 34 ---- .../packed_packed_loop1/avx-512-unroll.tmpli | 25 --- .../6x1/packed_packed_loop1/avx-512.tmpli | 29 ---- .../packed_packed_loop1/avx-512-unroll.tmpli | 70 -------- .../6x2/packed_packed_loop1/avx-512.tmpli | 38 ---- .../packed_packed_loop1/avx-512-unroll.tmpli | 40 ----- .../7x1/packed_packed_loop1/avx-512.tmpli | 21 --- .../packed_packed_loop1/avx-512-unroll.tmpli | 30 ---- .../8x1/packed_packed_loop1/avx-512.tmpli | 25 --- .../8x2/packed_packed_loop1/avx-512.tmpli | 42 ----- .../packed_packed_loop1/avx-512-unroll.tmpli | 61 ------- .../8x8/packed_packed_loop1/avx-512.tmpli | 33 ---- linalg/x86_64/avx512/avx512_mmm_f32.tmpliq | 28 +++ .../x86_64/avx512/avx512_mmm_f32_128x1.tmpl | 95 ---------- linalg/x86_64/avx512/avx512_mmm_f32_16x1.tmpl | 134 -------------- .../x86_64/avx512/avx512_mmm_f32_16x12.tmpl | 164 ------------------ linalg/x86_64/avx512/avx512_mmm_f32_16x8.tmpl | 142 --------------- .../x86_64/avx512/avx512_mmm_f32_32x12.tmpl | 22 --- linalg/x86_64/avx512/avx512_mmm_f32_32x5.tmpl | 143 --------------- linalg/x86_64/avx512/avx512_mmm_f32_32x6.tmpl | 160 ----------------- linalg/x86_64/avx512/avx512_mmm_f32_48x4.tmpl | 147 ---------------- linalg/x86_64/avx512/avx512_mmm_f32_64x3.tmpl | 148 ---------------- linalg/x86_64/avx512/avx512_mmm_f32_80x2.tmpl | 147 ---------------- linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl | 148 ---------------- linalg/x86_64/avx512/f32_add_mat_mul.tmpliq | 14 +- .../avx512/f32_add_row_col_products.tmpliq | 24 ++- linalg/x86_64/avx512/f32_add_unicast.tmpliq | 37 ++-- linalg/x86_64/avx512/f32_per_cols.tmpliq | 20 ++- linalg/x86_64/avx512/f32_per_rows.tmpliq | 20 ++- linalg/x86_64/avx512/f32_scalars.tmpliq | 20 ++- linalg/x86_64/avx512/f32_store_clear.tmpliq | 16 +- linalg/x86_64/avx512/zmm_per_col.tmpliq | 30 +++- linalg/x86_64/avx512/zmm_per_row.tmpliq | 26 ++- linalg/x86_64/avx512/zmm_scalar.tmpliq | 18 +- linalg/x86_64/kernel_throughput.py | 51 ++++++ 60 files changed, 459 insertions(+), 2861 deletions(-) create mode 100644 linalg/benches/kernel_test.rs delete mode 100644 linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli delete mode 100644 
linalg/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli delete mode 100644 linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli delete mode 100644 linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli delete mode 100644 linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli delete mode 100644 linalg/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/2x12/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/4x3/i32.tmpli delete mode 100644 linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli delete mode 100644 linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli delete mode 100644 linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli create mode 100644 linalg/x86_64/avx512/avx512_mmm_f32.tmpliq delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_128x1.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_16x1.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_16x12.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_16x8.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_32x12.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_32x5.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_32x6.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_48x4.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_64x3.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_f32_80x2.tmpl delete mode 100644 linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl create mode 100644 linalg/x86_64/kernel_throughput.py diff --git a/linalg/Cargo.toml b/linalg/Cargo.toml index b5e01949fd..5991d1dc79 100644 --- a/linalg/Cargo.toml +++ b/linalg/Cargo.toml @@ -99,3 +99,8 @@ harness = false [[bench]] name = "intel" harness = false + +[[bench]] +bench = false +name = "kernel_test" +harness = false diff --git a/linalg/benches/intel.rs b/linalg/benches/intel.rs index d98a4ab8c4..c43d1b366a 100644 --- a/linalg/benches/intel.rs +++ 
b/linalg/benches/intel.rs
@@ -6,7 +6,7 @@ use tract_linalg::mmm::OutputStoreKer;
 
 fn ruin_cache() {
     // return;
-    let _a = (0..1000000).collect::<Vec<i32>>();
+    let _a = std::hint::black_box((0..10000000).collect::<Vec<i32>>());
 }
 
 pub fn reference(mr: usize, k: usize, nr: usize) -> Vec
@@ -63,7 +63,7 @@ fn bench_to_nanos<
             FusedSpec::AddMatMul {
                 k,
                 a: kernel.a_packed(4, k).wrap(&a.view()),
-                b: kernel.b_packed(4, k).wrap(&b.view()).unwrap(),
+                b: kernel.b_packed(4, k).wrap(&b.view()),
             },
             // FusedSpec::AddUnicast(kernel.c_view(1, 0).wrap(&c.view_mut())),
             FusedSpec::Store(kernel.c_view(1, 0).wrap(&c.view_mut())),
diff --git a/linalg/benches/kernel_test.rs b/linalg/benches/kernel_test.rs
new file mode 100644
index 0000000000..14d45297d1
--- /dev/null
+++ b/linalg/benches/kernel_test.rs
@@ -0,0 +1,84 @@
+use criterion::*;
+
+mod utils;
+use tract_data::prelude::DatumType;
+use tract_linalg::mmm::MatMatMul;
+use tract_linalg::mmm::MatMatMulKer;
+use utils::*;
+
+pub fn mat_mat_mm(
+    be: &mut Bencher,
+    &(mm, dt, m, k, n, cold): &(&dyn MatMatMul, DatumType, usize, usize, usize, bool),
+) {
+    mat_mat_with_mm(be, mm, &(dt, m, k, n, cold));
+}
+
+fn cold_and_hot(c: &mut Criterion, mm: &dyn MatMatMul, m: usize, k: usize, n: usize) {
+    let mut group = c.benchmark_group(format!("{}", mm.kernel_name()));
+    group.throughput(Throughput::Elements((m * k * n) as u64));
+    let id = format!("{m}x{k}x{n}");
+    group.bench_with_input(
+        BenchmarkId::new("f32/cold", &id),
+        &(mm, DatumType::F32, m, k, n, false),
+        mat_mat_mm,
+    );
+    // group.bench_with_input(
+    //     BenchmarkId::new("f32/hot", &id),
+    //     &(mm, DatumType::F32, m, k, n, true),
+    //     mat_mat_mm,
+    // );
+}
+
+fn mm(be: &mut Criterion, mm: impl AsRef<dyn MatMatMul>, n: usize) {
+    // for m in (0..1024).step_by(128).skip(1) {
+    cold_and_hot(be, mm.as_ref(), 1024, 1000, n);
+    // }
+}
+
+fn all(c: &mut Criterion) {
+    use tract_linalg::x86_64_fma::mmm::*;
+    macro_rules! benches_for_n {
+        ($c:expr ; $n:expr ; $m:expr) => (
+            paste::paste! {
+                mm($c, [<avx512_mmm_f32_ $m x $n>]::mmm(), $n);
+            }
+        );
+        ($c:expr ; $x:expr ; $m1:expr, $($y:expr),+) => (
+            benches_for_n!($c ; $x ; $m1);
+            benches_for_n!($c ; $x ; $($y),+);
+        );
+    }
+
+    benches_for_n!(c; 1 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
+    benches_for_n!(c; 2 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160);
+    benches_for_n!(c; 3 ; 16, 32, 48, 64, 80, 96, 112);
+    benches_for_n!(c; 4 ; 16, 32, 48, 64, 80, 96);
+    benches_for_n!(c; 5 ; 16, 32, 48, 64, 80);
+    benches_for_n!(c; 6 ; 16, 32, 48, 64);
+    benches_for_n!(c; 7 ; 16, 32, 48);
+    benches_for_n!(c; 8 ; 16, 32, 48);
+    benches_for_n!(c; 9 ; 16, 32, 48);
+    benches_for_n!(c; 10 ; 16, 32);
+    benches_for_n!(c; 11 ; 16, 32);
+    benches_for_n!(c; 12 ; 16, 32);
+    benches_for_n!(c; 13 ; 16, 32);
+    benches_for_n!(c; 14 ; 16, 32);
+    benches_for_n!(c; 15 ; 16);
+    benches_for_n!(c; 16 ; 16);
+    benches_for_n!(c; 17 ; 16);
+    benches_for_n!(c; 18 ; 16);
+    benches_for_n!(c; 19 ; 16);
+    benches_for_n!(c; 20 ; 16);
+    benches_for_n!(c; 21 ; 16);
+    benches_for_n!(c; 22 ; 16);
+    benches_for_n!(c; 23 ; 16);
+    benches_for_n!(c; 24 ; 16);
+    benches_for_n!(c; 25 ; 16);
+    benches_for_n!(c; 26 ; 16);
+    benches_for_n!(c; 27 ; 16);
+    benches_for_n!(c; 28 ; 16);
+    benches_for_n!(c; 29 ; 16);
+}
+
+criterion_group!(benches, all);
+criterion_main!(benches);
diff --git a/linalg/benches/utils.rs b/linalg/benches/utils.rs
index 321c8140b1..bb3574760b 100644
--- a/linalg/benches/utils.rs
+++ b/linalg/benches/utils.rs
@@ -28,7 +28,8 @@ pub fn packed_vec(c: &mut Criterion, name: &str, m: usize, k: usize, n: usize) {
 }
 
 pub fn ruin_cache() {
-    let _a = (0..1000000).collect::<Vec<i32>>();
+    // the collect gets optimized out by llvm without black_box
+    let _a = std::hint::black_box((0..10000000).collect::<Vec<i32>>());
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/linalg/build.rs b/linalg/build.rs
index 55c526c381..ff8ab37cb9 100644
--- a/linalg/build.rs
+++ b/linalg/build.rs
@@ -72,6 +72,11 @@ impl ConfigForHalf {
     }
 }
 
+struct GenerateKernelsSpec {
+    sizes: Vec<(usize, usize)>,
+    file: path::PathBuf,
+}
+
 fn main() {
     let target = var("TARGET");
     let arch = var("CARGO_CFG_TARGET_ARCH");
@@ -83,8 +88,28 @@ fn main() {
 
     match arch.as_ref() {
         "x86_64" => {
-            let mut files = preprocess_files("x86_64/fma", &[], &suffix, false);
-            files.extend(preprocess_files("x86_64/avx512", &[], &suffix, false));
+            let mut files = preprocess_files("x86_64/fma", &[], &suffix, false, None);
+            // limits of the size of the kernels in avx512; index is n-1
+            let avx_kernels_max = [
+                240, 160, 112, 96, 80, 64, 48, 48, 48, 32, 32, 32, 32, 32, 16, 16, 16, 16, 16, 16,
+                16, 16, 16, 16, 16, 16, 16, 16, 16,
+            ];
+            let avx512_kernels: Vec<_> = avx_kernels_max
+                .iter()
+                .enumerate()
+                .flat_map(|(n_min_1, &max)| (16..=max).step_by(16).map(move |m| (m, n_min_1 + 1)))
+                .collect();
+
+            files.extend(preprocess_files(
+                "x86_64/avx512",
+                &[],
+                &suffix,
+                false,
+                Some(GenerateKernelsSpec {
+                    sizes: avx512_kernels,
+                    file: "x86_64/avx512/avx512_mmm_f32.tmpliq".into(),
+                }),
+            ));
 
             if os == "windows" {
                 if use_masm() {
@@ -136,7 +161,7 @@ fn main() {
             }
         }
         "arm" | "armv7" => {
-            let files = preprocess_files("arm32/armvfpv2", &[], &suffix, false);
+            let files = preprocess_files("arm32/armvfpv2", &[], &suffix, false, None);
             cc::Build::new()
                 .files(files)
                 .flag("-marm")
@@ -148,6 +173,7 @@ fn main() {
                 &[("core", vec!["cortexa7", "cortexa9", "generic"])],
                 &suffix,
                 false,
+                None,
             );
             cc::Build::new()
                 .files(files)
@@ -162,11 +188,12 @@ fn main() {
                 &[("core", vec!["a53", "a55", "gen"])],
                 &suffix,
                 false,
+                None,
            );
            cc::Build::new().files(files).static_flag(true).compile("arm64simd");
             if os == "macos" {
                 // aarch64 darwin => M1
-                let files = preprocess_files("arm64/apple_amx", &[], &suffix, false);
+                let files = preprocess_files("arm64/apple_amx", &[], &suffix, false, None);
                 cc::Build::new().files(files).static_flag(true).compile("appleamx");
             }
             if std::env::var("CARGO_FEATURE_NO_FP16").is_err() {
@@ -177,6 +204,7 @@ fn main() {
                     &[("core", vec!["a55", "gen"])],
                     &suffix,
                     config.needs_pragma,
+                    None,
                 );
                 config.cc().files(files).static_flag(true).compile("arm64fp16")
             }
@@ -192,36 +220,53 @@ fn preprocess_files(
     input: impl AsRef<path::Path>,
     variants: &[Variant],
     suffix: &str,
     needs_pragma: bool,
+    generate_kernels_spec: Option<GenerateKernelsSpec>,
 ) -> Vec<path::PathBuf> {
     let out_dir = path::PathBuf::from(var("OUT_DIR"));
     let mut files = vec![];
-    let dir_entries = {
-        let mut dir_entries: Vec<fs::DirEntry> =
-            input.as_ref().read_dir().unwrap().map(|f| f.unwrap()).collect();
-        dir_entries.sort_by_key(|a| a.path());
-        dir_entries
-    };
-    for f in dir_entries {
-        if f.path().extension() == Some(ffi::OsStr::new("tmpl")) {
-            let tmpl_file = f.path().file_name().unwrap().to_str().unwrap().to_owned();
-            let concerned_variants: Vec<&Variant> =
-                variants.iter().filter(|v| tmpl_file.contains(v.0)).collect();
-            let expanded_variants = concerned_variants.iter().map(|pair| pair.1.len()).product();
-            for v in 0..expanded_variants {
-                let mut tmpl_file = tmpl_file.clone();
-                let mut id = v;
-                let mut globals = vec![];
-                for variable in variants {
-                    let key = variable.0;
-                    let value = variable.1[id % variable.1.len()];
-                    globals.push((key, value));
-                    tmpl_file = tmpl_file.replace(key, value);
-                    id /= variable.1.len();
+
+    if let Some(spec) = generate_kernels_spec {
+        let tmpl_file = spec.file.file_name().unwrap().to_str().unwrap();
+        for (m, n) in spec.sizes {
+            let globals = vec![
+                ("mr", liquid::model::Value::scalar(format!("{m}"))),
+                ("nr", liquid::model::Value::scalar(format!("{n}"))),
+            ];
+            let file = out_dir.join(format!("{tmpl_file}_{m}x{n}.S"));
+            println!("{}", file.display());
+            preprocess_file(&spec.file, &file, &globals, suffix, needs_pragma);
+            files.push(file);
+        }
+    } else {
+        let dir_entries = {
+            let mut dir_entries: Vec<fs::DirEntry> =
+                input.as_ref().read_dir().unwrap().map(|f| f.unwrap()).collect();
+            dir_entries.sort_by_key(|a| a.path());
+            dir_entries
+        };
+        for f in dir_entries {
+            if f.path().extension() == Some(ffi::OsStr::new("tmpl")) {
+                let tmpl_file = f.path().file_name().unwrap().to_str().unwrap().to_owned();
+                let concerned_variants: Vec<&Variant> =
+                    variants.iter().filter(|v| tmpl_file.contains(v.0)).collect();
+                let expanded_variants =
+                    concerned_variants.iter().map(|pair| pair.1.len()).product();
+                for v in 0..expanded_variants {
+                    let mut tmpl_file = tmpl_file.clone();
+                    let mut id = v;
+                    let mut globals = vec![];
+                    for variable in variants {
+                        let key = variable.0;
+                        let value = variable.1[id % variable.1.len()];
+                        globals.push((key, liquid::model::Value::scalar(value)));
+                        tmpl_file = tmpl_file.replace(key, value);
+                        id /= variable.1.len();
+                    }
+                    let mut file = out_dir.join(tmpl_file);
+                    file.set_extension("S");
+                    preprocess_file(f.path(), &file, &globals, suffix, needs_pragma);
+                    files.push(file);
                 }
-                let mut file = out_dir.join(tmpl_file);
-                file.set_extension("S");
-                preprocess_file(f.path(), &file, &globals, suffix, needs_pragma);
-                files.push(file);
             }
         }
     }
@@ -239,7 +284,7 @@ fn strip_comments(s: String, msvc: bool) -> String {
 fn preprocess_file(
     template: impl AsRef<path::Path>,
     output: impl AsRef<path::Path>,
-    variants: &[(&'static str, &'static str)],
+    added_globals: &[(&'static
str, liquid::model::Value)], suffix: &str, needs_pragma: bool, ) { @@ -277,8 +322,8 @@ fn preprocess_file( "jump_table": jump_table(), "align": align, }); - for (k, v) in variants { - globals.insert(k.to_string().into(), liquid::model::Value::scalar(*v)); + for (k, v) in added_globals { + globals.insert(k.to_string().into(), v.clone()); } let partials = load_partials(template.as_ref().parent().unwrap(), msvc); let mut parser = liquid::ParserBuilder::with_stdlib() diff --git a/linalg/src/x86_64_fma/mmm.rs b/linalg/src/x86_64_fma/mmm.rs index ded1e27248..23cc63f67f 100644 --- a/linalg/src/x86_64_fma/mmm.rs +++ b/linalg/src/x86_64_fma/mmm.rs @@ -7,14 +7,60 @@ MMMKernel!(f32, fma_mmm_f32_24x4; 24, 4; 32, 4; 0, 0; no_prefetch, is_x86_featur MMMKernel!(f32, fma_mmm_f32_32x3; 32, 3; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("fma")); MMMKernel!(f32, fma_mmm_f32_40x2; 40, 2; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("fma")); MMMKernel!(f32, fma_mmm_f32_64x1; 64, 1; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("fma")); -MMMKernel!(f32, avx512_mmm_f32_128x1; 128, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_16x1; 16, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_16x12; 16, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_16x8; 16, 8; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_32x6; 32, 6; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_32x5; 32, 5; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_48x4; 48, 4; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_64x3; 64, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -MMMKernel!(f32, avx512_mmm_f32_80x2; 80, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(i32, avx2_mmm_i32_8x8; 8, 8; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx2")); + +// MMMKernel!(f32, avx512_mmm_f32_240x1; 240, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_160x2; 160, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_112x3; 112, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_96x4; 96, 4; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_80x5; 80, 5; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_64x6; 64, 6; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_48x7; 48, 7; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_48x8; 48, 8; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_48x9; 48, 9; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_32x10; 32, 10; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_32x11; 32, 11; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); +// MMMKernel!(f32, avx512_mmm_f32_32x12; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + +macro_rules! 
make_kernels_for_n {
+    ($n:expr ; $m:expr) => (
+        paste! {
+            MMMKernel!(f32, [<avx512_mmm_f32_ $m x $n>]; $m, $n; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+        }
+    );
+    ($n:expr ; $m1:expr, $($y:expr),+) => (
+        make_kernels_for_n!($n ; $m1);
+        make_kernels_for_n!($n ; $($y),+);
+    )
+}
+
+make_kernels_for_n!(1 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
+make_kernels_for_n!(2 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160);
+make_kernels_for_n!(3 ; 16, 32, 48, 64, 80, 96, 112);
+make_kernels_for_n!(4 ; 16, 32, 48, 64, 80, 96);
+make_kernels_for_n!(5 ; 16, 32, 48, 64, 80);
+make_kernels_for_n!(6 ; 16, 32, 48, 64);
+make_kernels_for_n!(7 ; 16, 32, 48);
+make_kernels_for_n!(8 ; 16, 32, 48);
+make_kernels_for_n!(9 ; 16, 32, 48);
+make_kernels_for_n!(10 ; 16, 32);
+make_kernels_for_n!(11 ; 16, 32);
+make_kernels_for_n!(12 ; 16, 32);
+make_kernels_for_n!(13 ; 16, 32);
+make_kernels_for_n!(14 ; 16, 32);
+make_kernels_for_n!(15 ; 16);
+make_kernels_for_n!(16 ; 16);
+make_kernels_for_n!(17 ; 16);
+make_kernels_for_n!(18 ; 16);
+make_kernels_for_n!(19 ; 16);
+make_kernels_for_n!(20 ; 16);
+make_kernels_for_n!(21 ; 16);
+make_kernels_for_n!(22 ; 16);
+make_kernels_for_n!(23 ; 16);
+make_kernels_for_n!(24 ; 16);
+make_kernels_for_n!(25 ; 16);
+make_kernels_for_n!(26 ; 16);
+make_kernels_for_n!(27 ; 16);
+make_kernels_for_n!(28 ; 16);
+make_kernels_for_n!(29 ; 16);
diff --git a/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli
deleted file mode 100644
index 857f7821c7..0000000000
--- a/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512-unroll.tmpli
+++ /dev/null
@@ -1,59 +0,0 @@
-    // Tile size: 10x1
-    // Accumulators: 0-9
-    // Col regs: 10-19
-    // Row regs: 20, 21
-
-    vbroadcastss zmm20, dword ptr [rcx]
-
-    vmovaps zmm10, [rax + 0]
-    vmovaps zmm11, [rax + 64]
-    vmovaps zmm12, [rax + 128]
-    vmovaps zmm13, [rax + 192]
-    vmovaps zmm14, [rax + 256]
-
-    vfmadd231ps zmm0, zmm10, zmm20
-    vfmadd231ps zmm1, zmm11, zmm20
-    vfmadd231ps zmm2, zmm12, zmm20
-    vfmadd231ps zmm3, zmm13, zmm20
-    vfmadd231ps zmm4, zmm14, zmm20
-
-    vmovaps zmm15, [rax + 320]
-    vmovaps zmm16, [rax + 384]
-    vmovaps zmm17, [rax + 448]
-    vmovaps zmm18, [rax + 512]
-    vmovaps zmm19, [rax + 576]
-
-    vfmadd231ps zmm5, zmm10, zmm20
-    vfmadd231ps zmm6, zmm11, zmm20
-    vfmadd231ps zmm7, zmm12, zmm20
-    vfmadd231ps zmm8, zmm13, zmm20
-    vfmadd231ps zmm9, zmm14, zmm20
-
-    vbroadcastss zmm21, dword ptr [rcx + 4]
-
-    vmovaps zmm10, [rax + 640]
-    vmovaps zmm11, [rax + 704]
-    vmovaps zmm12, [rax + 768]
-    vmovaps zmm13, [rax + 832]
-    vmovaps zmm14, [rax + 896]
-
-    vfmadd231ps zmm0, zmm10, zmm21
-    vfmadd231ps zmm1, zmm11, zmm21
-    vfmadd231ps zmm2, zmm12, zmm21
-    vfmadd231ps zmm3, zmm13, zmm21
-    vfmadd231ps zmm4, zmm14, zmm21
-
-    vmovaps zmm15, [rax + 960]
-    vmovaps zmm16, [rax + 1024]
-    vmovaps zmm17, [rax + 1088]
-    vmovaps zmm18, [rax + 1152]
-    vmovaps zmm19, [rax + 1216]
-
-    vfmadd231ps zmm5, zmm10, zmm21
-    vfmadd231ps zmm6, zmm11, zmm21
-    vfmadd231ps zmm7, zmm12, zmm21
-    vfmadd231ps zmm8, zmm13, zmm21
-    vfmadd231ps zmm9, zmm14, zmm21
-
-    add rcx, 8
-    add rax, 1280
diff --git a/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli
deleted file mode 100644
index 76aaae5bff..0000000000
--- a/linalg/x86_64/avx512/10x1/packed_packed_loop1/avx-512.tmpli
+++ /dev/null
@@ -1,33 +0,0 @@
-    // Tile size: 10x1
-    // Accumulators: 0-9
-    // Col regs: 10-19
-    // Row regs: 20
-
-    vbroadcastss zmm20,
dword ptr [rcx] - - vmovaps zmm10, [rax + 0] - vmovaps zmm11, [rax + 64] - vmovaps zmm12, [rax + 128] - vmovaps zmm13, [rax + 192] - vmovaps zmm14, [rax + 256] - - vfmadd231ps zmm0, zmm10, zmm20 - vfmadd231ps zmm1, zmm11, zmm20 - vfmadd231ps zmm2, zmm12, zmm20 - vfmadd231ps zmm3, zmm13, zmm20 - vfmadd231ps zmm4, zmm14, zmm20 - - vmovaps zmm15, [rax + 320] - vmovaps zmm16, [rax + 384] - vmovaps zmm17, [rax + 448] - vmovaps zmm18, [rax + 512] - vmovaps zmm19, [rax + 576] - - vfmadd231ps zmm5, zmm10, zmm20 - vfmadd231ps zmm6, zmm11, zmm20 - vfmadd231ps zmm7, zmm12, zmm20 - vfmadd231ps zmm8, zmm13, zmm20 - vfmadd231ps zmm9, zmm14, zmm20 - - add rcx, 4 - add rax, 320 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index ba4e6232c0..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,7 +0,0 @@ - vbroadcastss zmm15, dword ptr [rcx] - - vmovups zmm8, [rax] - vfmadd231ps zmm0, zmm15, zmm8 - - add rcx, 4 - add rax, 64 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli deleted file mode 100644 index 4a1c310834..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-16.tmpli +++ /dev/null @@ -1,68 +0,0 @@ - vmovups zmm31, [rcx] - // vbroadcastss zmm17, [rcx + 4 * 0] - // vbroadcastss zmm18, [rcx + 4 * 1] - // vbroadcastss zmm19, [rcx + 4 * 2] - // vbroadcastss zmm20, [rcx + 4 * 3] - // vbroadcastss zmm21, [rcx + 4 * 4] - // vbroadcastss zmm22, [rcx + 4 * 5] - // vbroadcastss zmm23, [rcx + 4 * 6] - // vbroadcastss zmm24, [rcx + 4 * 7] - // vbroadcastss zmm25, [rcx + 4 * 8] - // vbroadcastss zmm26, [rcx + 4 * 9] - // vbroadcastss zmm27, [rcx + 4 * 10] - // vbroadcastss zmm28, [rcx + 4 * 11] - // vbroadcastss zmm29, [rcx + 4 * 12] - // vbroadcastss zmm30, [rcx + 4 * 13] - // vbroadcastss zmm31, [rcx + 4 * 14] - - vbroadcastss zmm16, xmm31 - valignd zmm17, zmm31, zmm31, 1 - vbroadcastss zmm17, xmm17 - valignd zmm18, zmm31, zmm31, 2 - vbroadcastss zmm18, xmm18 - valignd zmm19, zmm31, zmm31, 3 - vbroadcastss zmm19, xmm19 - valignd zmm20, zmm31, zmm31, 4 - vbroadcastss zmm20, xmm20 - valignd zmm21, zmm31, zmm31, 5 - vbroadcastss zmm21, xmm21 - valignd zmm22, zmm31, zmm31, 6 - vbroadcastss zmm22, xmm22 - valignd zmm23, zmm31, zmm31, 7 - vbroadcastss zmm23, xmm23 - valignd zmm24, zmm31, zmm31, 8 - vbroadcastss zmm24, xmm24 - valignd zmm25, zmm31, zmm31, 9 - vbroadcastss zmm25, xmm25 - valignd zmm26, zmm31, zmm31, 10 - vbroadcastss zmm26, xmm26 - valignd zmm27, zmm31, zmm31, 11 - vbroadcastss zmm27, xmm27 - valignd zmm28, zmm31, zmm31, 12 - vbroadcastss zmm28, xmm28 - valignd zmm29, zmm31, zmm31, 13 - vbroadcastss zmm29, xmm29 - valignd zmm30, zmm31, zmm31, 14 - vbroadcastss zmm30, xmm30 - valignd zmm31, zmm31, zmm31, 15 - vbroadcastss zmm31, xmm31 - - vfmadd231ps zmm0, zmm16, [rax + 0] - vfmadd231ps zmm1, zmm17, [rax + 64] - vfmadd231ps zmm2, zmm18, [rax + 128] - vfmadd231ps zmm3, zmm19, [rax + 192] - vfmadd231ps zmm4, zmm20, [rax + 256] - vfmadd231ps zmm5, zmm21, [rax + 320] - vfmadd231ps zmm6, zmm22, [rax + 384] - vfmadd231ps zmm7, zmm23, [rax + 448] - vfmadd231ps zmm8, zmm24, [rax + 512] - vfmadd231ps zmm9, zmm25, [rax + 576] - vfmadd231ps zmm10, zmm26, [rax + 640] - vfmadd231ps zmm11, zmm27, [rax + 704] - vfmadd231ps zmm12, zmm28, [rax + 768] - vfmadd231ps zmm13, zmm29, [rax + 832] - vfmadd231ps zmm14, zmm30, [rax + 896] - vfmadd231ps zmm15, zmm31, [rax + 
960] - - add rcx, 64 - add rax, 1024 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli deleted file mode 100644 index 103be7015b..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-4.tmpli +++ /dev/null @@ -1,24 +0,0 @@ - // slow - vbroadcastss xmm16, dword ptr [rcx] - vbroadcastss xmm17, dword ptr [rcx + 4] - vbroadcastss xmm18, dword ptr [rcx + 8] - vbroadcastss xmm19, dword ptr [rcx + 12] - - // fast - vmovups xmm31, [rcx] - vbroadcastss zmm16, xmm31 - valignd xmm17, xmm31, xmm31, 1 - vbroadcastss zmm17, xmm17 - valignd xmm18, xmm31, xmm31, 2 - vbroadcastss zmm18, xmm18 - valignd xmm19, xmm31, xmm31, 3 - vbroadcastss zmm19, xmm19 - - // commmon - vfmadd231ps zmm0, zmm16, [rax + 0] - vfmadd231ps zmm1, zmm17, [rax + 64] - vfmadd231ps zmm2, zmm18, [rax + 128] - vfmadd231ps zmm3, zmm19, [rax + 192] - - add rcx, 16 - add rax, 256 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli deleted file mode 100644 index d6cb277f89..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll-8.tmpli +++ /dev/null @@ -1,29 +0,0 @@ - vmovups ymm31, [rcx] - - vbroadcastss zmm16, xmm31 - valignd ymm17, ymm31, ymm31, 1 - vbroadcastss zmm17, xmm17 - valignd ymm18, ymm31, ymm31, 2 - vbroadcastss zmm18, xmm18 - valignd ymm19, ymm31, ymm31, 3 - vbroadcastss zmm19, xmm19 - valignd ymm20, ymm31, ymm31, 4 - vbroadcastss zmm20, xmm20 - valignd ymm21, ymm31, ymm31, 5 - vbroadcastss zmm21, xmm21 - valignd ymm22, ymm31, ymm31, 6 - vbroadcastss zmm22, xmm22 - valignd ymm23, ymm31, ymm31, 7 - vbroadcastss zmm23, xmm23 - - vfmadd231ps zmm0, zmm16, [rax + 0] - vfmadd231ps zmm1, zmm17, [rax + 64] - vfmadd231ps zmm2, zmm18, [rax + 128] - vfmadd231ps zmm3, zmm19, [rax + 192] - vfmadd231ps zmm4, zmm20, [rax + 256] - vfmadd231ps zmm5, zmm21, [rax + 320] - vfmadd231ps zmm6, zmm22, [rax + 384] - vfmadd231ps zmm7, zmm23, [rax + 448] - - add rcx, 32 - add rax, 512 diff --git a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli b/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli deleted file mode 100644 index 8c9bf905b3..0000000000 --- a/linalg/x86_64/avx512/1x1/packed_packed_loop1/unroll.tmpli +++ /dev/null @@ -1,11 +0,0 @@ - vbroadcastss zmm15, dword ptr [rcx] - - vmovaps zmm8, [rax + 0] - vfmadd231ps zmm0, zmm15, zmm8 - - vbroadcastss zmm16, dword ptr [rcx + 4] - vmovaps zmm9, [rax + 64] - vfmadd231ps zmm1, zmm16, zmm9 - - add rcx, 8 - add rax, 128 diff --git a/linalg/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 4ffab3bd4e..0000000000 --- a/linalg/x86_64/avx512/1x12/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,45 +0,0 @@ - // Tile size: 1x12 - // Accumulators: 0-11 - // Col regs: zmm14 - // Row regs: zmm15 - - vmovaps zmm15, [rax] - - vbroadcastss zmm14, dword ptr [rcx + 0 * 4] - vfmadd231ps zmm0, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 1 * 4] - vfmadd231ps zmm1, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 2 * 4] - vfmadd231ps zmm2, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 3 * 4] - vfmadd231ps zmm3, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 4 * 4] - vfmadd231ps zmm4, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 5 * 4] - vfmadd231ps zmm5, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 6 * 4] - vfmadd231ps zmm6, zmm15, zmm14 - - vbroadcastss zmm14, 
dword ptr [rcx + 7 * 4] - vfmadd231ps zmm7, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 8 * 4] - vfmadd231ps zmm8, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 9 * 4] - vfmadd231ps zmm9, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 10 * 4] - vfmadd231ps zmm10, zmm15, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 11 * 4] - vfmadd231ps zmm11, zmm15, zmm14 - - add rcx, 48 - add rax, 64 \ No newline at end of file diff --git a/linalg/x86_64/avx512/2x12/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/2x12/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 8ca7750da9..0000000000 --- a/linalg/x86_64/avx512/2x12/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,60 +0,0 @@ - // Tile size: 2x12 - // Accumulators: zmm0-23 - // Col regs: zmm25 - // Row regs: zmm26-27 - - prefetcht0 [rax + 256] - prefetcht0 [rax+64 + 256] - vmovaps zmm26, [rax] - vmovaps zmm27, [rax+64] - - vbroadcastss zmm25, dword ptr [rcx+0] - vfmadd231ps zmm0, zmm26, zmm25 - vfmadd231ps zmm1, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+4] - vfmadd231ps zmm2, zmm26, zmm25 - vfmadd231ps zmm3, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+8] - vfmadd231ps zmm4, zmm26, zmm25 - vfmadd231ps zmm5, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+12] - vfmadd231ps zmm6, zmm26, zmm25 - vfmadd231ps zmm7, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+16] - vfmadd231ps zmm8, zmm26, zmm25 - vfmadd231ps zmm9, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+20] - vfmadd231ps zmm10, zmm26, zmm25 - vfmadd231ps zmm11, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+24] - vfmadd231ps zmm12, zmm26, zmm25 - vfmadd231ps zmm13, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+28] - vfmadd231ps zmm14, zmm26, zmm25 - vfmadd231ps zmm15, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+32] - vfmadd231ps zmm16, zmm26, zmm25 - vfmadd231ps zmm17, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+36] - vfmadd231ps zmm18, zmm26, zmm25 - vfmadd231ps zmm19, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+40] - vfmadd231ps zmm20, zmm26, zmm25 - vfmadd231ps zmm21, zmm27, zmm25 - - vbroadcastss zmm25, dword ptr [rcx+44] - vfmadd231ps zmm22, zmm26, zmm25 - vfmadd231ps zmm23, zmm27, zmm25 - - add rax, 128 - add rcx, 48 diff --git a/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 118d312c82..0000000000 --- a/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,53 +0,0 @@ - // Accumulators: 0-9 - // Columns: 15-16 - // Rows: 10-14 - vbroadcastss zmm10, dword ptr [rcx] - vbroadcastss zmm11, dword ptr [rcx + 4] - vbroadcastss zmm12, dword ptr [rcx + 8] - vbroadcastss zmm13, dword ptr [rcx + 12] - vbroadcastss zmm14, dword ptr [rcx + 16] - - vmovaps zmm15, [rax] - vmovaps zmm16, [rax + 64] - - vfmadd231ps zmm0, zmm15, zmm10 - vfmadd231ps zmm1, zmm16, zmm10 - - vfmadd231ps zmm2, zmm15, zmm11 - vfmadd231ps zmm3, zmm16, zmm11 - - vfmadd231ps zmm4, zmm15, zmm12 - vfmadd231ps zmm5, zmm16, zmm12 - - vfmadd231ps zmm6, zmm15, zmm13 - vfmadd231ps zmm7, zmm16, zmm13 - - vfmadd231ps zmm8, zmm15, zmm14 - vfmadd231ps zmm9, zmm16, zmm14 - - vbroadcastss zmm10, dword ptr [rcx + 20] - vbroadcastss zmm11, dword ptr [rcx + 24] - vbroadcastss zmm12, dword ptr [rcx + 28] - vbroadcastss zmm13, dword ptr [rcx + 32] - vbroadcastss zmm14, dword ptr [rcx + 36] - - vmovaps zmm15, [rax + 128] - vmovaps zmm16, [rax + 192] - - vfmadd231ps zmm0, zmm15, zmm10 - 
vfmadd231ps zmm1, zmm16, zmm10 - - vfmadd231ps zmm2, zmm15, zmm11 - vfmadd231ps zmm3, zmm16, zmm11 - - vfmadd231ps zmm4, zmm15, zmm12 - vfmadd231ps zmm5, zmm16, zmm12 - - vfmadd231ps zmm6, zmm15, zmm13 - vfmadd231ps zmm7, zmm16, zmm13 - - vfmadd231ps zmm8, zmm15, zmm14 - vfmadd231ps zmm9, zmm16, zmm14 - - add rcx, 40 - add rax, 256 diff --git a/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index e017834d25..0000000000 --- a/linalg/x86_64/avx512/2x5/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,30 +0,0 @@ - // Accumulators: 0-9 - // Columns: 15 - // Rows: 10-14 - - vbroadcastss zmm10, dword ptr [rcx] - vbroadcastss zmm11, dword ptr [rcx + 4] - vbroadcastss zmm12, dword ptr [rcx + 8] - vbroadcastss zmm13, dword ptr [rcx + 12] - vbroadcastss zmm14, dword ptr [rcx + 16] - - vmovaps zmm15, [rax] - vmovaps zmm16, [rax + 64] - - vfmadd231ps zmm0, zmm15, zmm10 - vfmadd231ps zmm1, zmm16, zmm10 - - vfmadd231ps zmm2, zmm15, zmm11 - vfmadd231ps zmm3, zmm16, zmm11 - - vfmadd231ps zmm4, zmm15, zmm12 - vfmadd231ps zmm5, zmm16, zmm12 - - vfmadd231ps zmm6, zmm15, zmm13 - vfmadd231ps zmm7, zmm16, zmm13 - - vfmadd231ps zmm8, zmm15, zmm14 - vfmadd231ps zmm9, zmm16, zmm14 - - add rcx, 20 - add rax, 128 diff --git a/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 9d6c940a94..0000000000 --- a/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,71 +0,0 @@ - // Tile size: 2x6 - // Accumulators: 0-11 - // Col regs: zmm14-15 - // Row regs: zmm12-13 - - vbroadcastss zmm14, dword ptr [rcx] - vmovaps zmm12, [rax] - vmovaps zmm13, [rax + 64] - vbroadcastss zmm15, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm1, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 8] - - vfmadd231ps zmm2, zmm12, zmm15 - vfmadd231ps zmm3, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm5, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 16] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 20] - - vfmadd231ps zmm8, zmm12, zmm14 - vfmadd231ps zmm9, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx+24] - - vfmadd231ps zmm10, zmm12, zmm15 - vfmadd231ps zmm11, zmm13, zmm15 - - // Iteration two - vmovaps zmm12, [rax + 128] - vmovaps zmm13, [rax + 192] - vbroadcastss zmm15, dword ptr [rcx + 24 + 4] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm1, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 24 + 8] - - vfmadd231ps zmm2, zmm12, zmm15 - vfmadd231ps zmm3, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 24 + 12] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm5, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 24 + 16] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 24 + 20] - - vfmadd231ps zmm8, zmm12, zmm14 - vfmadd231ps zmm9, zmm13, zmm14 - - vfmadd231ps zmm10, zmm12, zmm15 - vfmadd231ps zmm11, zmm13, zmm15 - - add rax, 256 - add rcx, 48 diff --git a/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 31f861b105..0000000000 --- a/linalg/x86_64/avx512/2x6/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,39 +0,0 @@ - // Tile size: 2x6 - // 
Accumulators: 0-11 - // Col regs: zmm14-15 - // Row regs: zmm12-13 - - // Load ordered by earliest use for first 2x2 block - vbroadcastss zmm14, dword ptr [rcx] - vmovaps zmm12, [rax] - vmovaps zmm13, [rax + 64] - vbroadcastss zmm15, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm1, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 8] - - vfmadd231ps zmm2, zmm12, zmm15 - vfmadd231ps zmm3, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm5, zmm13, zmm14 - - vbroadcastss zmm14, dword ptr [rcx + 16] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 20] - - vfmadd231ps zmm8, zmm12, zmm14 - vfmadd231ps zmm9, zmm13, zmm14 - - vfmadd231ps zmm10, zmm12, zmm15 - vfmadd231ps zmm11, zmm13, zmm15 - - add rax, 128 - add rcx, 24 diff --git a/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index c36b7f6b6a..0000000000 --- a/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,63 +0,0 @@ - // Tile size: 3x4 - // Accumulators: 0-11 - // Col regs: zmm12-14 - // Row regs: zmm15 - - vmovaps zmm12, [rax] - vmovaps zmm13, [rax+64] - vmovaps zmm14, [rax+128] - - vbroadcastss zmm15, dword ptr [rcx + 0] - - vfmadd231ps zmm0, zmm12, zmm15 - vfmadd231ps zmm1, zmm13, zmm15 - vfmadd231ps zmm2, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 4] - - vfmadd231ps zmm3, zmm12, zmm15 - vfmadd231ps zmm4, zmm13, zmm15 - vfmadd231ps zmm5, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 8] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - vfmadd231ps zmm8, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - - vfmadd231ps zmm9, zmm12, zmm15 - vfmadd231ps zmm10, zmm13, zmm15 - vfmadd231ps zmm11, zmm14, zmm15 - - vmovaps zmm12, [rax + 192] - vmovaps zmm13, [rax + 256] - vmovaps zmm14, [rax + 320] - - vbroadcastss zmm15, dword ptr [rcx + 16] - - vfmadd231ps zmm0, zmm12, zmm15 - vfmadd231ps zmm1, zmm13, zmm15 - vfmadd231ps zmm2, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 20] - - vfmadd231ps zmm3, zmm12, zmm15 - vfmadd231ps zmm4, zmm13, zmm15 - vfmadd231ps zmm5, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 24] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - vfmadd231ps zmm8, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 28] - - vfmadd231ps zmm9, zmm12, zmm15 - vfmadd231ps zmm10, zmm13, zmm15 - vfmadd231ps zmm11, zmm14, zmm15 - - add rax, 384 - add rcx, 32 diff --git a/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index a8b1c3221a..0000000000 --- a/linalg/x86_64/avx512/3x4/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,35 +0,0 @@ - // Tile size: 3x4 - // Accumulators: 0-11 - // Col regs: zmm12-14 - // Row regs: zmm15 - - vmovaps zmm12, [rax] - vmovaps zmm13, [rax+64] - vmovaps zmm14, [rax+128] - - vbroadcastss zmm15, dword ptr [rcx + 0] - - vfmadd231ps zmm0, zmm12, zmm15 - vfmadd231ps zmm1, zmm13, zmm15 - vfmadd231ps zmm2, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 4] - - vfmadd231ps zmm3, zmm12, zmm15 - vfmadd231ps zmm4, zmm13, zmm15 - vfmadd231ps zmm5, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 8] - - vfmadd231ps zmm6, zmm12, zmm15 - vfmadd231ps zmm7, zmm13, zmm15 - vfmadd231ps zmm8, zmm14, zmm15 - - vbroadcastss zmm15, dword ptr 
[rcx + 12] - - vfmadd231ps zmm9, zmm12, zmm15 - vfmadd231ps zmm10, zmm13, zmm15 - vfmadd231ps zmm11, zmm14, zmm15 - - add rax, 192 - add rcx, 16 diff --git a/linalg/x86_64/avx512/4x3/i32.tmpli b/linalg/x86_64/avx512/4x3/i32.tmpli deleted file mode 100644 index d6707e92c8..0000000000 --- a/linalg/x86_64/avx512/4x3/i32.tmpli +++ /dev/null @@ -1,50 +0,0 @@ - // Tile size: 4x3 - // Accumulators: 0-11 - // Col regs: zmm12 - // Row regs: zmm13-15 - - // Load col of A - vmovaps zmm12, [rax] - - // Fill 3 cols of B - vbroadcastss zmm13, dword ptr [rcx + 0] - vbroadcastss zmm14, dword ptr [rcx + 4] - vbroadcastss zmm15, dword ptr [rcx + 8] - - // N.B. Stepping cols in inner loop - vpmulld zmm31, zmm12, zmm13 - vpaddd zmm0, zmm0, zmm31 - vpmulld zmm30, zmm12, zmm14 - vpaddd zmm4, zmm4, zmm30 - vpmulld zmm29, zmm12, zmm15 - vpaddd zmm8, zmm8, zmm29 - - vmovaps zmm12, [rax+64] - - vpmulld zmm31, zmm12, zmm13 - vpaddd zmm1, zmm1, zmm31 - vpmulld zmm30, zmm12, zmm14 - vpaddd zmm5, zmm5, zmm30 - vpmulld zmm29, zmm12, zmm15 - vpaddd zmm9, zmm9, zmm29 - - vmovaps zmm12, [rax+128] - - vpmulld zmm31, zmm12, zmm13 - vpaddd zmm2, zmm2, zmm31 - vpmulld zmm30, zmm12, zmm14 - vpaddd zmm6, zmm6, zmm30 - vpmulld zmm29, zmm12, zmm15 - vpaddd zmm10, zmm10, zmm29 - - vmovaps zmm12, [rax+192] - - vpmulld zmm31, zmm12, zmm13 - vpaddd zmm3, zmm3, zmm31 - vpmulld zmm30, zmm12, zmm14 - vpaddd zmm7, zmm7, zmm30 - vpmulld zmm29, zmm12, zmm15 - vpaddd zmm11, zmm11, zmm29 - - add rcx, 12 - add rax, 256 diff --git a/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index fe661b7fa2..0000000000 --- a/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,69 +0,0 @@ - // Tile size: 4x3 - // Accumulators: 0-11 - // Col regs: zmm12 - // Row regs: zmm13-15 - - // Load col of A - vmovaps zmm12, [rax] - - // Fill 3 cols of B - vbroadcastss zmm13, dword ptr [rcx + 0] - vbroadcastss zmm14, dword ptr [rcx + 4] - vbroadcastss zmm15, dword ptr [rcx + 8] - - // N.B. Stepping cols in inner loop - vfmadd231ps zmm0, zmm12, zmm13 - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax+64] - - vfmadd231ps zmm1, zmm12, zmm13 - vfmadd231ps zmm5, zmm12, zmm14 - vfmadd231ps zmm9, zmm12, zmm15 - - vmovaps zmm12, [rax+128] - - vfmadd231ps zmm2, zmm12, zmm13 - vfmadd231ps zmm6, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vmovaps zmm12, [rax+192] - - vfmadd231ps zmm3, zmm12, zmm13 - vfmadd231ps zmm7, zmm12, zmm14 - vfmadd231ps zmm11, zmm12, zmm15 - - // Load col of A, switching col! - vmovaps zmm13, [rax + 256] - - // Fill 3 cols of B - vbroadcastss zmm14, dword ptr [rcx + 12] - vbroadcastss zmm15, dword ptr [rcx + 16] - vbroadcastss zmm12, dword ptr [rcx + 20] - - // N.B. 
Stepping cols in inner loop - vfmadd231ps zmm0, zmm13, zmm14 - vfmadd231ps zmm4, zmm13, zmm15 - vfmadd231ps zmm8, zmm13, zmm12 - - vmovaps zmm13, [rax + 320] - - vfmadd231ps zmm1, zmm13, zmm14 - vfmadd231ps zmm5, zmm13, zmm15 - vfmadd231ps zmm9, zmm13, zmm12 - - vmovaps zmm13, [rax + 384] - - vfmadd231ps zmm2, zmm13, zmm14 - vfmadd231ps zmm6, zmm13, zmm15 - vfmadd231ps zmm10, zmm13, zmm12 - - vmovaps zmm13, [rax + 448] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - vfmadd231ps zmm11, zmm13, zmm12 - - add rcx, 24 - add rax, 512 diff --git a/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 0e71a747e4..0000000000 --- a/linalg/x86_64/avx512/4x3/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,38 +0,0 @@ - // Tile size: 4x3 - // Accumulators: 0-11 - // Col regs: zmm12 - // Row regs: zmm13-15 - - // Load col of A - vmovaps zmm12, [rax] - - // Fill 3 cols of B - vbroadcastss zmm13, dword ptr [rcx + 0] - vbroadcastss zmm14, dword ptr [rcx + 4] - vbroadcastss zmm15, dword ptr [rcx + 8] - - // N.B. Stepping cols in inner loop - vfmadd231ps zmm0, zmm12, zmm13 - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax+64] - - vfmadd231ps zmm1, zmm12, zmm13 - vfmadd231ps zmm5, zmm12, zmm14 - vfmadd231ps zmm9, zmm12, zmm15 - - vmovaps zmm12, [rax+128] - - vfmadd231ps zmm2, zmm12, zmm13 - vfmadd231ps zmm6, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vmovaps zmm12, [rax+192] - - vfmadd231ps zmm3, zmm12, zmm13 - vfmadd231ps zmm7, zmm12, zmm14 - vfmadd231ps zmm11, zmm12, zmm15 - - add rcx, 12 - add rax, 256 diff --git a/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 6a5b887b8b..0000000000 --- a/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,63 +0,0 @@ - // Tile size: 5x2 - // Accumulators: 0-9 - // Col regs: zmm10-13 - // Row regs: zmm14-15 - - vmovaps zmm10, [rax] - vbroadcastss zmm14, dword ptr [rcx + 0] - vbroadcastss zmm15, dword ptr [rcx + 4] - vmovaps zmm11, [rax + 64] - - // NB stepping column-wise - vfmadd231ps zmm0, zmm10, zmm14 - vfmadd231ps zmm5, zmm10, zmm15 - - vmovaps zmm12, [rax + 128] - - vfmadd231ps zmm1, zmm11, zmm14 - vfmadd231ps zmm6, zmm11, zmm15 - - vmovaps zmm13, [rax + 192] - - vfmadd231ps zmm2, zmm12, zmm14 - vfmadd231ps zmm7, zmm12, zmm15 - - vmovaps zmm10, [rax + 256] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm8, zmm13, zmm15 - - vmovaps zmm11, [rax + 320] - - vfmadd231ps zmm4, zmm10, zmm14 - vfmadd231ps zmm9, zmm10, zmm15 - - vbroadcastss zmm14, dword ptr [rcx + 8] - vbroadcastss zmm15, dword ptr [rcx + 12] - - vmovaps zmm12, [rax + 384] - - // NB stepping column-wise - vfmadd231ps zmm0, zmm11, zmm14 - vfmadd231ps zmm5, zmm11, zmm15 - - vmovaps zmm13, [rax + 448] - - vfmadd231ps zmm1, zmm12, zmm14 - vfmadd231ps zmm6, zmm12, zmm15 - - vmovaps zmm10, [rax + 512] - - vfmadd231ps zmm2, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - - vmovaps zmm11, [rax + 576] - - vfmadd231ps zmm3, zmm10, zmm14 - vfmadd231ps zmm8, zmm10, zmm15 - - vfmadd231ps zmm4, zmm11, zmm14 - vfmadd231ps zmm9, zmm11, zmm15 - - add rax, 640 - add rcx, 16 diff --git a/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 73ef89b588..0000000000 --- 
a/linalg/x86_64/avx512/5x2/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,34 +0,0 @@ - // Tile size: 5x2 - // Accumulators: 0-9 - // Col regs: zmm10-14 - // Row regs: zmm15-16 - - vmovaps zmm10, [rax] - vbroadcastss zmm15, dword ptr [rcx + 0] - vbroadcastss zmm16, dword ptr [rcx + 4] - vmovaps zmm11, [rax + 64] - - // NB stepping column-wise - vfmadd231ps zmm0, zmm10, zmm15 - vfmadd231ps zmm5, zmm10, zmm16 - - vmovaps zmm12, [rax + 128] - - vfmadd231ps zmm1, zmm11, zmm15 - vfmadd231ps zmm6, zmm11, zmm16 - - vmovaps zmm13, [rax + 192] - - vfmadd231ps zmm2, zmm12, zmm15 - vfmadd231ps zmm7, zmm12, zmm16 - - vmovaps zmm14, [rax + 256] - - vfmadd231ps zmm3, zmm13, zmm15 - vfmadd231ps zmm8, zmm13, zmm16 - - vfmadd231ps zmm4, zmm14, zmm15 - vfmadd231ps zmm9, zmm14, zmm16 - - add rax, 320 - add rcx, 8 diff --git a/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 8c77044339..0000000000 --- a/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,25 +0,0 @@ - // Tile size: 6x1 - // Accumulators: 0-5 - // Col regs: 6-11 - // Row regs: 15 - - - vbroadcastss zmm15, dword ptr [rcx] - vfmadd231ps zmm0, zmm15, [rax] - vfmadd231ps zmm1, zmm15, [rax + 64] - vfmadd231ps zmm2, zmm15, [rax + 128] - vfmadd231ps zmm3, zmm15, [rax + 192] - vfmadd231ps zmm4, zmm15, [rax + 256] - vfmadd231ps zmm5, zmm15, [rax + 320] - - vbroadcastss zmm14, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm14, [rax + 384] - vfmadd231ps zmm1, zmm14, [rax + 448] - vfmadd231ps zmm2, zmm14, [rax + 512] - vfmadd231ps zmm3, zmm14, [rax + 576] - vfmadd231ps zmm4, zmm14, [rax + 640] - vfmadd231ps zmm5, zmm14, [rax + 704] - - add rax, 768 - add rcx, 8 diff --git a/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index a34c40fee4..0000000000 --- a/linalg/x86_64/avx512/6x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,29 +0,0 @@ - // Tile size: 6x1 - // Accumulators: 0-5 - // Col regs: 6-11 - // Row regs: 15 - - vbroadcastss zmm15, dword ptr [rcx] - - vmovups zmm10, [rax] - vmulps zmm10, zmm10, zmm15 - vaddps zmm0, zmm0, zmm10 - vmovups zmm11, [rax + 64] - vmulps zmm11, zmm11, zmm15 - vaddps zmm1, zmm1, zmm11 - vmovups zmm12, [rax + 128] - vmulps zmm12, zmm12, zmm15 - vaddps zmm2, zmm2, zmm12 - vmovups zmm13, [rax + 192] - vmulps zmm13, zmm13, zmm15 - vaddps zmm3, zmm3, zmm13 - vmovups zmm14, [rax + 256] - vmulps zmm14, zmm14, zmm15 - vaddps zmm4, zmm4, zmm14 - vmovups zmm15, [rax + 320] - vmulps zmm15, zmm15, zmm15 - vaddps zmm5, zmm5, zmm15 - - - add rcx, 4 - add rax, 384 diff --git a/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 58ed8f4331..0000000000 --- a/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,70 +0,0 @@ - // Tile size: 6x2 - // Accumulators: 0-9 - // Col regs: zmm10-13 - // Row regs: zmm14-15 - - vmovaps zmm12, [rax] - vbroadcastss zmm14, dword ptr [rcx + 0] - vbroadcastss zmm15, dword ptr [rcx + 4] - vmovaps zmm13, [rax + 64] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm6, zmm12, zmm15 - - vmovaps zmm12, [rax + 128] - - vfmadd231ps zmm1, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - - vmovaps zmm13, [rax + 192] - - vfmadd231ps zmm2, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax + 256] - - 
vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm9, zmm13, zmm15 - - vmovaps zmm13, [rax + 320] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vmovaps zmm12, [rax + 384] - vbroadcastss zmm14, dword ptr [rcx + 8] - - vfmadd231ps zmm5, zmm13, zmm14 - vfmadd231ps zmm11, zmm13, zmm15 - - vbroadcastss zmm15, dword ptr [rcx + 12] - vmovaps zmm13, [rax + 448] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm6, zmm12, zmm15 - - vmovaps zmm12, [rax + 512] - - vfmadd231ps zmm1, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - - vmovaps zmm13, [rax + 576] - - vfmadd231ps zmm2, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax + 640] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm9, zmm13, zmm15 - - vmovaps zmm13, [rax + 704] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vfmadd231ps zmm5, zmm13, zmm14 - vfmadd231ps zmm11, zmm13, zmm15 - - add rax, 768 - add rcx, 16 diff --git a/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 0fa5fa8e45..0000000000 --- a/linalg/x86_64/avx512/6x2/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,38 +0,0 @@ - // Tile size: 6x2 - // Accumulators: 0-11 - // Col regs: 12-13 - // Row regs: 14-15 - - vmovaps zmm12, [rax] - vbroadcastss zmm14, dword ptr [rcx + 0] - vbroadcastss zmm15, dword ptr [rcx + 4] - vmovaps zmm13, [rax + 64] - - vfmadd231ps zmm0, zmm12, zmm14 - vfmadd231ps zmm6, zmm12, zmm15 - - vmovaps zmm12, [rax + 128] - - vfmadd231ps zmm1, zmm13, zmm14 - vfmadd231ps zmm7, zmm13, zmm15 - - vmovaps zmm13, [rax + 192] - - vfmadd231ps zmm2, zmm12, zmm14 - vfmadd231ps zmm8, zmm12, zmm15 - - vmovaps zmm12, [rax + 256] - - vfmadd231ps zmm3, zmm13, zmm14 - vfmadd231ps zmm9, zmm13, zmm15 - - vmovaps zmm13, [rax + 320] - - vfmadd231ps zmm4, zmm12, zmm14 - vfmadd231ps zmm10, zmm12, zmm15 - - vfmadd231ps zmm5, zmm13, zmm14 - vfmadd231ps zmm11, zmm13, zmm15 - - add rcx, 8 - add rax, 384 diff --git a/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index e23d79d2d5..0000000000 --- a/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,40 +0,0 @@ - // Tile size: 6x1 - // Accumulators: 0-5 - // Col regs: 6-11 - // Row regs: 15 - - vbroadcastss zmm15, dword ptr [rcx] - - vmovaps zmm7, [rax + 0] - vmovaps zmm8, [rax + 64] - vmovaps zmm9, [rax + 128] - vmovaps zmm10, [rax + 192] - vmovaps zmm11, [rax + 256] - vmovaps zmm12, [rax + 320] - vmovaps zmm13, [rax + 384] - - vfmadd231ps zmm0, zmm7, zmm15 - vfmadd231ps zmm1, zmm8, zmm15 - vfmadd231ps zmm2, zmm9, zmm15 - vfmadd231ps zmm3, zmm10, zmm15 - vfmadd231ps zmm4, zmm11, zmm15 - vfmadd231ps zmm5, zmm12, zmm15 - vfmadd231ps zmm6, zmm13, zmm15 - - vbroadcastss zmm16, dword ptr [rcx + 4] - - vmovaps zmm7, [rax + 448 + 0] - vmovaps zmm8, [rax + 448 + 64] - vmovaps zmm9, [rax + 448 + 128] - vmovaps zmm10, [rax + 448 + 192] - vmovaps zmm11, [rax + 448 + 256] - vmovaps zmm12, [rax + 448 + 320] - vmovaps zmm13, [rax + 448 + 384] - - vfmadd231ps zmm0, zmm7, zmm15 - vfmadd231ps zmm1, zmm8, zmm15 - vfmadd231ps zmm2, zmm9, zmm15 - vfmadd231ps zmm3, zmm10, zmm15 - vfmadd231ps zmm4, zmm11, zmm15 - vfmadd231ps zmm5, zmm12, zmm15 - vfmadd231ps zmm6, zmm13, zmm15 diff --git a/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 
889cb34e9b..0000000000 --- a/linalg/x86_64/avx512/7x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,21 +0,0 @@ - // Tile size: 7x1 - // Accumulators: 0-6 - // Col regs: 6-13 - // Row regs: 15 - vbroadcastss zmm15, dword ptr [rcx] - - vmovaps zmm7, [rax + 0] - vmovaps zmm8, [rax + 64] - vmovaps zmm9, [rax + 128] - vmovaps zmm10, [rax + 192] - vmovaps zmm11, [rax + 256] - vmovaps zmm12, [rax + 320] - vmovaps zmm13, [rax + 384] - - vfmadd231ps zmm0, zmm7, zmm15 - vfmadd231ps zmm1, zmm8, zmm15 - vfmadd231ps zmm2, zmm9, zmm15 - vfmadd231ps zmm3, zmm10, zmm15 - vfmadd231ps zmm4, zmm11, zmm15 - vfmadd231ps zmm5, zmm12, zmm15 - vfmadd231ps zmm6, zmm13, zmm15 diff --git a/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 96d0d9863d..0000000000 --- a/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,30 +0,0 @@ - // Tile size: 8x1 - // Accumulators: 0-7 - // Col regs: 8-14 - // Row regs: 15 - - vbroadcastss zmm17, dword ptr [rcx] - - - vfmadd231ps zmm0, zmm17, [rax + 0] - vfmadd231ps zmm1, zmm17, [rax + 64] - vfmadd231ps zmm2, zmm17, [rax + 128] - vfmadd231ps zmm3, zmm17, [rax + 192] - vfmadd231ps zmm4, zmm17, [rax + 256] - vfmadd231ps zmm5, zmm17, [rax + 320] - vfmadd231ps zmm6, zmm17, [rax + 384] - vfmadd231ps zmm7, zmm17, [rax + 448] - - vbroadcastss zmm16, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm16, [rax + 0 + 512] - vfmadd231ps zmm1, zmm16, [rax + 64 + 512] - vfmadd231ps zmm2, zmm16, [rax + 128 + 512] - vfmadd231ps zmm3, zmm16, [rax + 192 + 512] - vfmadd231ps zmm4, zmm16, [rax + 256 + 512] - vfmadd231ps zmm5, zmm16, [rax + 320 + 512] - vfmadd231ps zmm6, zmm16, [rax + 384 + 512] - vfmadd231ps zmm7, zmm16, [rax + 448 + 512] - - add rcx, 8 - add rax, 1024 diff --git a/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 38d57ce66d..0000000000 --- a/linalg/x86_64/avx512/8x1/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,25 +0,0 @@ - // Tile size: 8x1 - // Accumulators: 0-7 - // Col regs: 8-14 - // Row regs: 15 - - vbroadcastss zmm15, dword ptr [rcx] - - vmovaps zmm8, [rax + 0] - vfmadd231ps zmm0, zmm15, zmm8 - vmovaps zmm9, [rax + 64] - vfmadd231ps zmm1, zmm15, zmm9 - vmovaps zmm10, [rax + 128] - vfmadd231ps zmm2, zmm15, zmm10 - vmovaps zmm11, [rax + 192] - vfmadd231ps zmm3, zmm15, zmm11 - vmovaps zmm12, [rax + 256] - vfmadd231ps zmm4, zmm15, zmm12 - vmovaps zmm13, [rax + 320] - vfmadd231ps zmm5, zmm15, zmm13 - vmovaps zmm14, [rax + 384] - vfmadd231ps zmm6, zmm15, zmm14 - vmovaps zmm8, [rax + 448] - vfmadd231ps zmm7, zmm15, zmm8 - add rcx, 4 - add rax, 512 diff --git a/linalg/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index 772651ce8f..0000000000 --- a/linalg/x86_64/avx512/8x2/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,42 +0,0 @@ - // Tile size: 8x2 - // Accumulators: 0-15 - // Col regs: 16-23 - // Row regs: 24-25 - - vmovaps zmm16, [rax + 0] - vbroadcastss zmm24, dword ptr [rcx + 0] - vbroadcastss zmm25, dword ptr [rcx + 4] - - vfmadd231ps zmm0, zmm16, zmm24 - vfmadd231ps zmm8, zmm16, zmm25 - - vmovaps zmm17, [rax + 64] - vfmadd231ps zmm1, zmm17, zmm24 - vfmadd231ps zmm9, zmm17, zmm25 - - vmovaps zmm18, [rax + 128] - vfmadd231ps zmm2, zmm18, zmm24 - vfmadd231ps zmm10, zmm18, zmm25 - - vmovaps zmm19, [rax + 192] - vfmadd231ps zmm3, 
zmm19, zmm24 - vfmadd231ps zmm11, zmm19, zmm25 - - vmovaps zmm20, [rax + 256] - vfmadd231ps zmm4, zmm20, zmm24 - vfmadd231ps zmm12, zmm20, zmm25 - - vmovaps zmm21, [rax + 320] - vfmadd231ps zmm5, zmm21, zmm24 - vfmadd231ps zmm13, zmm21, zmm25 - - vmovaps zmm22, [rax + 384] - vfmadd231ps zmm6, zmm22, zmm24 - vfmadd231ps zmm14, zmm22, zmm25 - - vmovaps zmm23, [rax + 448] - vfmadd231ps zmm7, zmm23, zmm24 - vfmadd231ps zmm15, zmm23, zmm25 - - add rax, 512 - add rcx, 8 diff --git a/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli b/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli deleted file mode 100644 index 1400fdf0da..0000000000 --- a/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512-unroll.tmpli +++ /dev/null @@ -1,61 +0,0 @@ - // Tile size: 1x8 - // Accumulators: 0-7 - // Col regs: 8-14 - // Row regs: 15 - - - vmovaps zmm15, [rax] - - vbroadcastss zmm8, dword ptr [rcx + 0 * 4] - vfmadd231ps zmm0, zmm15, zmm8 - - vbroadcastss zmm9, dword ptr [rcx + 1 * 4] - vfmadd231ps zmm1, zmm15, zmm9 - - vbroadcastss zmm10, dword ptr [rcx + 2 * 4] - vfmadd231ps zmm2, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 3 * 4] - vfmadd231ps zmm3, zmm15, zmm11 - - vbroadcastss zmm12, dword ptr [rcx + 4 * 4] - vfmadd231ps zmm4, zmm15, zmm12 - - vbroadcastss zmm13, dword ptr [rcx + 5 * 4] - vfmadd231ps zmm5, zmm15, zmm13 - - vbroadcastss zmm10, dword ptr [rcx + 6 * 4] - vfmadd231ps zmm6, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 7 * 4] - vfmadd231ps zmm7, zmm15, zmm11 - - - vmovaps zmm15, [rax+64] - - vbroadcastss zmm8, dword ptr [rcx + 8 * 4] - vfmadd231ps zmm0, zmm15, zmm8 - - vbroadcastss zmm9, dword ptr [rcx + 9 * 4] - vfmadd231ps zmm1, zmm15, zmm9 - - vbroadcastss zmm10, dword ptr [rcx + 10 * 4] - vfmadd231ps zmm2, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 11 * 4] - vfmadd231ps zmm3, zmm15, zmm11 - - vbroadcastss zmm12, dword ptr [rcx + 12 * 4] - vfmadd231ps zmm4, zmm15, zmm12 - - vbroadcastss zmm13, dword ptr [rcx + 13 * 4] - vfmadd231ps zmm5, zmm15, zmm13 - - vbroadcastss zmm10, dword ptr [rcx + 14 * 4] - vfmadd231ps zmm6, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 15 * 4] - vfmadd231ps zmm7, zmm15, zmm11 - - add rcx, 64 - add rax, 128 \ No newline at end of file diff --git a/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli b/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli deleted file mode 100644 index c08151c2ac..0000000000 --- a/linalg/x86_64/avx512/8x8/packed_packed_loop1/avx-512.tmpli +++ /dev/null @@ -1,33 +0,0 @@ - // Tile size: 1x8 - // Accumulators: 0-7 - // Col regs: 8-14 - // Row regs: 15 - - vmovaps zmm15, [rax] - - vbroadcastss zmm8, dword ptr [rcx + 0 * 4] - vfmadd231ps zmm0, zmm15, zmm8 - - vbroadcastss zmm9, dword ptr [rcx + 1 * 4] - vfmadd231ps zmm1, zmm15, zmm9 - - vbroadcastss zmm10, dword ptr [rcx + 2 * 4] - vfmadd231ps zmm2, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 3 * 4] - vfmadd231ps zmm3, zmm15, zmm11 - - vbroadcastss zmm12, dword ptr [rcx + 4 * 4] - vfmadd231ps zmm4, zmm15, zmm12 - - vbroadcastss zmm13, dword ptr [rcx + 5 * 4] - vfmadd231ps zmm5, zmm15, zmm13 - - vbroadcastss zmm10, dword ptr [rcx + 6 * 4] - vfmadd231ps zmm6, zmm15, zmm10 - - vbroadcastss zmm11, dword ptr [rcx + 7 * 4] - vfmadd231ps zmm7, zmm15, zmm11 - - add rcx, 32 - add rax, 64 diff --git a/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq b/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq new file mode 100644 index 0000000000..b83cff8d09 --- /dev/null +++ b/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq @@ -0,0 +1,28 
@@ +{% comment %} +Generate the code for a full AVX512 f32 kernel. +--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} + +// The kernel will operate on mr times nr elements at once, +// by laying them out in the registers as such: +// +// mmm f32 32 x 12: +// zmm0 zmm2 zmm4 zmm6 zmm8 zmm10 zmm12 zmm14 zmm16 zmm18 zmm20 zmm22 +// zmm1 zmm3 zmm5 zmm7 zmm9 zmm11 zmm13 zmm15 zmm17 zmm19 zmm21 zmm23 + +{% assign kernel_name = mr | append:"x" | append:nr %} + +{% include "preamble.tmpliq" size:kernel_name, suffix:suffix, G:G, arch:"avx512" %} + +{% include "f32_add_mat_mul.tmpliq" mr:mr, nr:nr %} +{% include "f32_scalars.tmpliq" mr:mr, nr:nr %} +{% include "f32_per_rows.tmpliq" mr:mr, nr:nr %} +{% include "f32_per_cols.tmpliq" mr:mr, nr:nr %} +{% include "f32_store_clear.tmpliq" mr:mr, nr:nr %} +{% include "f32_add_row_col_products.tmpliq" mr:mr, nr:nr %} +{% include "f32_add_unicast.tmpliq" mr:mr, nr:nr %} + +{% include "postamble.tmpliq" size:kernel_name, suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_128x1.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_128x1.tmpl deleted file mode 100644 index 195e764f8a..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_128x1.tmpl +++ /dev/null @@ -1,95 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 128 x 1 - - zmm0 - zmm1 - ... - zmm7 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"128x1", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rbx, rbx - jz {{L}}non_linear_loop - -{{align}} 16 -{{L}}main_loop_packed_packed: - {% include "8x1/packed_packed_loop1/avx-512.tmpli" %} - - sub rbx, 1 - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:7 %} -{% include "f32_per_rows.tmpliq" mr:128, from:0, to:7 %} -{% include "f32_per_cols.tmpliq" mr:128, from:0, to:7 %} - -{{L}}add_unicast: - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - - {% for row in (0..7) %} - vaddps zmm{{row}}, zmm{{row}}, [ r10 + {{row|times:64}} ] - {% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm14, dword ptr [rbx] - -{% for i in (0..7) %} - vmovups zmm12, [rax + {{i|times:64}}] - vfmadd231ps zmm{{i}}, zmm12, zmm14 -{% endfor %} - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - - test r8, 63 - jnz {{L}}store_unaligned - - {% for row in (0..7) %} - vmovaps [r8 + {{row|times:64}}], zmm{{row}} - {% endfor %} - - jmp {{L}}non_linear_loop - - -{{L}}store_unaligned: - {% for row in (0..7) %} - vmovups [r8 + {{row|times:64}}], zmm{{row}} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"128x1", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_16x1.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_16x1.tmpl deleted file mode 100644 index 
4e46ca8b4c..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_16x1.tmpl +++ /dev/null @@ -1,134 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 16 x 1 - - zmm0 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - - -{% include "preamble.tmpliq" size:"16x1", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rbx, rbx - jz {{L}}non_linear_loop - - cmp rbx, 8 - jl {{L}}main_loop_packed_packed_tail - -{{align}} 16 -{{L}}main_loop_packed_packed: - {% include "1x1/packed_packed_loop1/unroll-4.tmpli" %} - - sub rbx, 4 - cmp rbx, 4 - jge {{L}}main_loop_packed_packed - - {% for r in (1..3) %} - vaddps zmm0, zmm0, zmm{{r}} - {% endfor %} - - test rbx, rbx - jz {{L}}non_linear_loop - -{{align}} 16 -{{L}}main_loop_packed_packed_tail: - {% include "1x1/packed_packed_loop1/avx-512.tmpli" %} - - sub rbx, 1 - jnz {{L}}main_loop_packed_packed_tail - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:0 %} -{% include "f32_per_rows.tmpliq" mr:16, from:0, to:0 %} -{% include "f32_per_cols.tmpliq" mr:16, from:0, to:0 %} - -{{L}}add_unicast: - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - - cmp rsi, 4 - jne {{L}}add_unicast_generic - - vaddps zmm0, zmm0, [r10] - - jmp {{L}}non_linear_loop - -{{L}}add_unicast_generic: - mov r8, [0] -// mov eax, 0 -// {% for i in (0..3) %} -// pinsrd xmm14, eax, {{i}} -// add eax, esi -// {% endfor %} -// {% for i in (0..3) %} -// pinsrd xmm15, eax, {{i}} -// add eax, esi -// {% endfor %} -// -// vperm2f128 zmm14, zmm14, zmm15, 32 // zmm14 <- xmm14::xmm15 -// -// {% for i in (0..7) %} -// vpcmpeqd zmm15, zmm15, zmm15 -// vgatherdps zmm12, [ r10 + zmm14 ], zmm15 -// -// vaddps zmm{{i}}, zmm{{i}}, zmm12 -// lea r10, [ r10 + rsi * 8 ] -// {% endfor %} -// - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm14, dword ptr [rbx] - -{% for i in (0..0) %} - vmovups zmm12, [rax + {{i|times:64}}] - vfmadd231ps zmm{{i}}, zmm12, zmm14 -{% endfor %} - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - - cmp rsi, 4 - jne {{L}}crash - - test r8, 63 - jnz {{L}}store_unaligned - - vmovaps [r8], zmm0 - jmp {{L}}non_linear_loop - -{{L}}store_unaligned: - vmovups [r8], zmm0 - jmp {{L}}non_linear_loop - -{{L}}crash: - mov r10, [0] -{% include "postamble.tmpliq" size:"16x1", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_16x12.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_16x12.tmpl deleted file mode 100644 index 159f689233..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_16x12.tmpl +++ /dev/null @@ -1,164 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 16 x 12 - - zmm0 zmm1 ... 
zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - - -{% include "preamble.tmpliq" size:"16x12", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rbx, rbx - jz {{L}}non_linear_loop - -{{align}} 16 -{{L}}main_loop_packed_packed_tail: - {% include "1x12/packed_packed_loop1/avx-512.tmpli" %} - - sub rbx, 1 - jnz {{L}}main_loop_packed_packed_tail - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:11 %} -{% include "f32_per_rows.tmpliq" mr:16, from:0, to:11 %} -{% include "f32_per_cols.tmpliq" mr:16, from:0, to:11 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..11) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i}}, zmm{{i}}, zmm12 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - -{% for i in (0..11) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i}}, zmm12, zmm14 -{% endfor %} - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for quarter in (0..3) %} - {% for r in (0..3) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..3) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - - mov r8, [rdi + 8] // c ptr - - // tops of cols - lea r8, [ r8 + 4 * rbx ] - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for quarter in (0..3) %} - {% for r in (0..3) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 4}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..3) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - - mov r8, [rdi + 8] // c ptr - - // tops of cols - lea r8, [ r8 + 8 * rbx ] - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for quarter in (0..3) %} - {% for r in (0..3) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 8}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..3) %} - vextractps dword ptr [r{{i | plus: 
8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"16x12", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_16x8.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_16x8.tmpl deleted file mode 100644 index e45835880c..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_16x8.tmpl +++ /dev/null @@ -1,142 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 16 x 8 - - zmm0 zmm1 ... zmm8 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of ZMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - - -{% include "preamble.tmpliq" size:"16x8", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rbx, rbx - jz {{L}}non_linear_loop - - cmp rbx, 2 - jl {{L}}main_loop_packed_packed_tail - -{{align}} 16 -{{L}}main_loop_packed_packed: - {% include "8x8/packed_packed_loop1/avx-512-unroll.tmpli" %} - - sub rbx, 2 - cmp rbx, 2 - jge {{L}}main_loop_packed_packed - - test rbx, rbx - jz {{L}}non_linear_loop - -{{align}} 16 -{{L}}main_loop_packed_packed_tail: - {% include "8x8/packed_packed_loop1/avx-512.tmpli" %} - - sub rbx, 1 - jnz {{L}}main_loop_packed_packed_tail - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:7 %} -{% include "f32_per_rows.tmpliq" mr:16, from:0, to:7 %} -{% include "f32_per_cols.tmpliq" mr:16, from:0, to:7 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..7) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i}}, zmm{{i}}, zmm12 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - -{% for i in (0..7) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i}}, zmm12, zmm14 -{% endfor %} - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r12, [ r8 + 4 * rbx ] - lea r11, [ r10 + rbx ] - lea r13, [ r12 + rbx ] - lea r14, [ r12 + 2 * rbx ] - lea r15, [ r13 + 2 * rbx ] - - {% for quarter in (0..3) %} - {% for r in (0..7) %} - vextractf32x4 xmm{{r | plus: 8}}, zmm{{r}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..7) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 8}}, {{row}} - add r{{i | plus: 8}}, 
rsi - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"16x8", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_32x12.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_32x12.tmpl deleted file mode 100644 index e39e426278..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_32x12.tmpl +++ /dev/null @@ -1,22 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm f32 32 x 12: - - zmm0 zmm2 zmm4 zmm6 zmm8 zmm10 zmm12 zmm14 zmm16 zmm18 zmm20 zmm22 - zmm1 zmm3 zmm5 zmm7 zmm9 zmm11 zmm13 zmm15 zmm17 zmm19 zmm21 zmm23 - -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"32x12", suffix:suffix, G:G, arch:"avx512" %} - -{% include "f32_add_mat_mul.tmpliq" mr:32, nr:12 %} -{% include "f32_scalars.tmpliq" from:0, to:23 %} -{% include "f32_per_rows.tmpliq" mr:32, from:0, to:23 %} -{% include "f32_per_cols.tmpliq" mr:32, from:0, to:23 %} -{% include "f32_store_clear.tmpliq" mr:32, nr:12 %} -{% include "f32_add_row_col_products.tmpliq" mr:32, nr:12 %} -{% include "f32_add_unicast.tmpliq" mr:32, nr:12 %} - -{% include "postamble.tmpliq" size:"32x6", suffix:suffix, G:G, L:L, arch:"avx512" %} diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_32x5.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_32x5.tmpl deleted file mode 100644 index 9d5c9f6b34..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_32x5.tmpl +++ /dev/null @@ -1,143 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 32 x 5: - - zmm0 zmm2 zmm4 zmm6 zmm8 - zmm1 zmm3 zmm5 zmm7 zmm9 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"32x5", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "2x5/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:9 %} -{% include "f32_per_rows.tmpliq" mr:32, from:0, to:9 %} -{% include "f32_per_cols.tmpliq" mr:32, from:0, to:9 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..4) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 2}}, zmm{{i | times: 2}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - -{% for i in (0..4) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + 
zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 2 | plus: 1}}, zmm{{i | times: 2 | plus: 1}}, zmm12 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - vmovups zmm13, zmmword ptr [rax+64] - -{% for i in (0..4) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i | times: 2}}, zmm12, zmm14 - vfmadd231ps zmm{{i | times: 2 | plus: 1}}, zmm13, zmm14 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - lea r12, [ r10 + 2 * rbx ] - - {% for word in (0..1) %} - {% for quarter in (0..3) %} - {% for r in (0..4) %} - vextractf32x4 xmm{{r | plus: 11}}, zmm{{r | times: 2 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..4) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 11}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"32x5", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_32x6.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_32x6.tmpl deleted file mode 100644 index d0f1c84f12..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_32x6.tmpl +++ /dev/null @@ -1,160 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 32 x 6: - - zmm0 zmm2 zmm4 zmm6 zmm8 zmm10 - zmm1 zmm3 zmm5 zmm7 zmm9 zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"32x6", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "2x6/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:11 %} -{% include "f32_per_rows.tmpliq" mr:32, from:0, to:11 %} -{% include "f32_per_cols.tmpliq" mr:32, from:0, to:11 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..5) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 2}}, zmm{{i | times: 2}}, zmm12 -{% endfor %} - - mov r10, [rdi + 8] - imul esi, 16 - vpbroadcastd zmm15, esi - vpaddd zmm14, 
zmm14, zmm15 - -{% for i in (0..5) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 2 | plus: 1}}, zmm{{i | times: 2 | plus: 1}}, zmm12 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - vmovups zmm13, zmmword ptr [rax+64] - -{% for i in (0..5) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i | times: 2}}, zmm12, zmm14 - vfmadd231ps zmm{{i | times: 2 | plus: 1}}, zmm13, zmm14 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for word in (0..1) %} - {% for quarter in (0..3) %} - {% for r in (0..2) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 2 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..2) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - // tops of cols - mov r8, r11 - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - - {% for word in (0..1) %} - {% for quarter in (0..3) %} - {% for r in (0..2) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | plus: 3 | times: 2 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..2) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"32x6", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_48x4.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_48x4.tmpl deleted file mode 100644 index b0410ee1ee..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_48x4.tmpl +++ /dev/null @@ -1,147 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 48 x 4: - - zmm0 zmm3 zmm6 zmm9 - zmm1 zmm4 zmm7 zmm10 - zmm2 zmm5 zmm8 zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"48x4", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "3x4/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:11 %} -{% include "f32_per_rows.tmpliq" mr:48, from:0, to:11 %} -{% include "f32_per_cols.tmpliq" mr:48, from:0, to:11 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - 
pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..3) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 3}}, zmm{{i | times: 3}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - -{% for j in (1..2) %} - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - - {% for i in (0..3) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 3 | plus: j}}, zmm{{i | times: 3 | plus: j}}, zmm12 - {% endfor %} -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vmovups zmm12, zmmword ptr [rax] - vmovups zmm13, zmmword ptr [rax+64] - vmovups zmm15, zmmword ptr [rax+128] - -{% for i in (0..3) %} - vbroadcastss zmm14, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i | times: 3}}, zmm12, zmm14 - vfmadd231ps zmm{{i | times: 3 | plus: 1}}, zmm13, zmm14 - vfmadd231ps zmm{{i | times: 3 | plus: 2}}, zmm15, zmm14 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for word in (0..2) %} - {% for quarter in (0..3) %} - {% for r in (0..3) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 3 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..3) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"48x4", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_64x3.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_64x3.tmpl deleted file mode 100644 index 0016c845b1..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_64x3.tmpl +++ /dev/null @@ -1,148 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 64 x 3: - - zmm0 zmm4 zmm8 - zmm1 zmm5 zmm9 - zmm2 zmm6 zmm10 - zmm3 zmm7 zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"64x3", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "4x3/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:11 %} -{% include "f32_per_rows.tmpliq" mr:64, from:0, to:11 %} -{% include "f32_per_cols.tmpliq" mr:64, from:0, to:11 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride 
- - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..2) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 4}}, zmm{{i | times: 4}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - -{% for j in (1..3) %} - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - - {% for i in (0..2) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 4 | plus: j}}, zmm{{i | times: 4 | plus: j}}, zmm12 - {% endfor %} -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm13, dword ptr [rbx] - vbroadcastss zmm14, dword ptr [rbx+4] - vbroadcastss zmm15, dword ptr [rbx+8] - -{% for i in (0..3) %} - vmovups zmm12, zmmword ptr [rax+{{i | times:64}}] - vfmadd231ps zmm{{i}}, zmm12, zmm13 - vfmadd231ps zmm{{i | plus: 4}}, zmm12, zmm14 - vfmadd231ps zmm{{i | plus: 8}}, zmm12, zmm15 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for word in (0..3) %} - {% for quarter in (0..3) %} - {% for r in (0..2) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 4 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..2) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"64x3", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_f32_80x2.tmpl b/linalg/x86_64/avx512/avx512_mmm_f32_80x2.tmpl deleted file mode 100644 index 2d4bc2c5a0..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_f32_80x2.tmpl +++ /dev/null @@ -1,147 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 80 x 2: - - zmm0 zmm5 - zmm1 zmm6 - zmm2 zmm7 - zmm3 zmm8 - zmm4 zmm9 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"80x2", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "5x2/packed_packed_loop1/avx-512.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "f32_scalars.tmpliq" from:0, to:9 %} -{% include "f32_per_rows.tmpliq" mr:80, from:0, to:9 %} -{% include 
"f32_per_cols.tmpliq" mr:80, from:0, to:9 %} - -{{L}}add_unicast: - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..1) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 5}}, zmm{{i | times: 5}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - -{% for j in (1..4) %} - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - - {% for i in (0..1) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 5 | plus: j}}, zmm{{i | times: 5 | plus: j}}, zmm12 - {% endfor %} -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm14, dword ptr [rbx] - vbroadcastss zmm15, dword ptr [rbx+4] - -{% for i in (0..4) %} - vmovups zmm12, zmmword ptr [rax+{{i | times:64}}] - vfmadd231ps zmm{{i}}, zmm12, zmm14 - vfmadd231ps zmm{{i | plus: 5}}, zmm12, zmm15 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for word in (0..4) %} - {% for quarter in (0..3) %} - {% for r in (0..1) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 5 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..1) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"80x2", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl b/linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl deleted file mode 100644 index 8146509240..0000000000 --- a/linalg/x86_64/avx512/avx512_mmm_i32_64x3.tmpl +++ /dev/null @@ -1,148 +0,0 @@ -{% comment %} -// vim: set syntax=asm : - -/* mmm 64 x 3: - - zmm0 zmm4 zmm8 - zmm1 zmm5 zmm9 - zmm2 zmm6 zmm10 - zmm3 zmm7 zmm11 - -System V ABI: - args: rdi, rsi, rdx, rcx, r8, r9 - preserve: rbx, rsp, rbp, r12, r13, r14, r15 - scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 - return: rax (+rdx) - -Windows ABI: - args: RCX, RDX, R8, R9 - preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 - scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15 - return: rax (+rdx) -*/ -{% endcomment %} - -{% include "preamble.tmpliq" size:"64x3", suffix:suffix, G:G, arch:"avx512" %} - -{{L}}clear: - vzeroall - jmp {{L}}non_linear_loop - -{{L}}add_mat_mul: - mov rcx, [rdi + 24] // B - mov rax, [rdi + 16] // A - - mov rbx, [rdi + 8] // k - test rcx, rcx - jz {{L}}non_linear_loop - -{{L}}main_loop_packed_packed: - {% include "4x3/i32.tmpli" %} - - dec rbx - jnz {{L}}main_loop_packed_packed - - jmp {{L}}non_linear_loop - -{% include "i32_scalars.tmpliq" from:0, to:11 %} -{% include 
"i32_per_rows.tmpliq" mr:64, from:0, to:11 %} -{% include "i32_per_cols.tmpliq" mr:64, from:0, to:11 %} - -{{L}}add_unicast: // todo: not done - - mov r10, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - mov eax, 0 - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - -{% for i in (0..2) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 4}}, zmm{{i | times: 4}}, zmm12 -{% endfor %} - - imul esi, 16 - vpbroadcastd zmm15, esi - -{% for j in (1..3) %} - mov r10, [rdi + 8] - vpaddd zmm14, zmm14, zmm15 - - {% for i in (0..2) %} - kxnorw k1,k1,k1 - vgatherdps zmm12{k1}, [ r10 + zmm14 ] - add r10, rbx - vaddps zmm{{i | times: 4 | plus: j}}, zmm{{i | times: 4 | plus: j}}, zmm12 - {% endfor %} -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] - - vbroadcastss zmm13, dword ptr [rbx] - vbroadcastss zmm14, dword ptr [rbx+4] - vbroadcastss zmm15, dword ptr [rbx+8] - -{% for i in (0..3) %} - vmovups zmm12, zmmword ptr [rax+{{i | times:64}}] - vfmadd231ps zmm{{i}}, zmm12, zmm13 - vfmadd231ps zmm{{i | plus: 4}}, zmm12, zmm14 - vfmadd231ps zmm{{i | plus: 8}}, zmm12, zmm15 -{% endfor %} - - jmp {{L}}non_linear_loop - -{{L}}store: - mov r8, [rdi + 8] // c ptr - mov rsi, [rdi + 16] // row stride - mov rbx, [rdi + 24] // col stride - - // tops of cols - lea r9, [ r8 + rbx ] - lea r10, [ r8 + 2 * rbx ] - lea r11, [ r10 + rbx ] - - {% for word in (0..3) %} - {% for quarter in (0..3) %} - {% for r in (0..2) %} - vextractf32x4 xmm{{r | plus: 12}}, zmm{{r | times: 4 | plus: word}}, {{quarter}} - {% endfor %} - {% for row in (0..3) %} - {% for i in (0..2) %} - vextractps dword ptr [r{{i | plus: 8}}], xmm{{i | plus: 12}}, {{row}} - add r{{i | plus: 8}}, rsi - {% endfor %} - {% endfor %} - {% endfor %} - {% endfor %} - - jmp {{L}}non_linear_loop - -{% include "postamble.tmpliq" size:"64x3", suffix:suffix, G:G, L:L, arch:"avx512" %} - diff --git a/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq index 8700acfae8..eaf83392a9 100644 --- a/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq +++ b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq @@ -33,7 +33,19 @@ Arguments: {% assign prefetch_dist = 2 %} -{% for i in (0..arch_mr_min_1) %} +// we limit the number of prefetches +// because otherwise we end up producing too many prefetches at once +// and, we fill the Line Fill Buffer, which is the cpu's buffer for +// outstanding fetch request for L1 - which by the way has a size of +// 10 requests on most cpus +// +// filling up the LFB is actually a very big deal, because subsequent prefetches +// will block until there is space in the LFB +// +// so we definitely do not want to issue 10 prefetches when mr is 10 :D +// we only issue 2 prefetches here at most +{% assign prefetches_to_issue_min_1 = arch_mr | at_most: 2 | minus: 1 %} +{% for i in (0..prefetches_to_issue_min_1) %} prefetcht0 [rax + {{i | times:64}} + {{m_total_bytes | times:prefetch_dist}}] {% endfor %} diff --git 
a/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq b/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq index dd91d8e1b7..f331c98330 100644 --- a/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq +++ b/linalg/x86_64/avx512/f32_add_row_col_products.tmpliq @@ -2,22 +2,30 @@ Generate the code for the store instruction. --- Arguments: + mr - kernel size in number of elements nr - kernel size in number of elements {% endcomment %} -{% assign nr_min_1 = nr | minus: 1 %} +{% assign nr_min_1 = nr | minus:1 %} +{% assign mr_arch = mr | divided_by:16 %} +{% assign mr_arch_min_1 = mr | divided_by:16 | minus:1 %} {{L}}add_row_col_products: - mov rax, [ rdi + 8 ] - mov rbx, [ rdi + 16 ] + mov rax, [rdi + 8] + mov rbx, [rdi + 16] - vmovups zmm31, zmmword ptr [rax] - vmovups zmm30, zmmword ptr [rax+64] +// name of the first scratch reg +{% assign scratch = mr_arch | times: nr %} + +{% for j in (0..mr_arch_min_1) %} + vmovups zmm{{scratch | plus:j}}, zmmword ptr [rax + {{j | times:64}}] +{% endfor %} {% for i in (0..nr_min_1) %} - vbroadcastss zmm29, dword ptr [rbx + {{i|times:4}} ] - vfmadd231ps zmm{{i | times: 2}}, zmm31, zmm29 - vfmadd231ps zmm{{i | times: 2 | plus: 1}}, zmm30, zmm29 + vbroadcastss zmm31, dword ptr [rbx + {{i | times:4}}] + {% for j in (0..mr_arch_min_1) %} + vfmadd231ps zmm{{mr_arch | times:i | plus:j}}, zmm{{scratch | plus:j}}, zmm31 + {% endfor %} {% endfor %} jmp {{L}}non_linear_loop \ No newline at end of file diff --git a/linalg/x86_64/avx512/f32_add_unicast.tmpliq b/linalg/x86_64/avx512/f32_add_unicast.tmpliq index bdf7501330..afec5451dc 100644 --- a/linalg/x86_64/avx512/f32_add_unicast.tmpliq +++ b/linalg/x86_64/avx512/f32_add_unicast.tmpliq @@ -72,23 +72,32 @@ Arguments: vmovups zmm15, zmm31 {% endif %} -{% for i in (0..nr) %} - kxnorw k1,k1,k1 - vgatherdps zmm27{k1}, [r10 + zmm26] - add r10, rbx - vaddps zmm{{i | times:2}}, zmm{{i | times:2}}, zmm27 +{% assign nr_min_1 = nr | minus:1 %} +{% assign mr_arch = mr | divided_by:16 %} +{% assign mr_arch_min_1 = mr | divided_by:16 | minus:1 %} + +{% for i in (0..nr_min_1) %} + kxnorw k1,k1,k1 + vgatherdps zmm27{k1}, [r10 + zmm26] + add r10, rbx + vaddps zmm{{i | times:mr_arch}}, zmm{{i | times:mr_arch}}, zmm27 {% endfor %} - mov r10, [rdi + 8] - imul esi, 16 - vpbroadcastd zmm25, esi - vpaddd zmm26, zmm26, zmm25 + imul esi, 16 + vpbroadcastd zmm25, esi -{% for i in (0..nr) %} - kxnorw k1,k1,k1 - vgatherdps zmm27{k1}, [r10 + zmm26] - add r10, rbx - vaddps zmm{{i | times:2 | plus:1}}, zmm{{i | times:2 | plus: 1}}, zmm27 +{% for j in (1..mr_arch_min_1) %} + mov r10, [rdi + 8] + vpaddd zmm26, zmm26, zmm25 + + {% for i in (0..nr_min_1) %} + kxnorw k1,k1,k1 + vgatherdps zmm27{k1}, [r10 + zmm26] + add r10, rbx + vaddps zmm{{i | times:mr_arch | plus:j}}, zmm{{i | times:mr_arch | plus:j}}, zmm27 + {% endfor %} {% endfor %} + jmp {{L}}non_linear_loop + diff --git a/linalg/x86_64/avx512/f32_per_cols.tmpliq b/linalg/x86_64/avx512/f32_per_cols.tmpliq index 6d4097d416..076543f5e2 100644 --- a/linalg/x86_64/avx512/f32_per_cols.tmpliq +++ b/linalg/x86_64/avx512/f32_per_cols.tmpliq @@ -1,8 +1,14 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for the per-col instructions for f32. 
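+
+Each generated label reads one f32 per output column and folds it into every
+accumulator register of that column. Reference semantics as a hedged Rust
+sketch (illustrative names only, not crate code):
+
+    fn per_col_op(acc: &mut [Vec<f32>], per_col: &[f32], op: fn(f32, f32) -> f32) {
+        // acc[col] holds one output column of the mr x nr tile
+        for (col, column) in acc.iter_mut().enumerate() {
+            for v in column.iter_mut() {
+                *v = op(*v, per_col[col]);
+            }
+        }
+    }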
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} -{% include "zmm_per_col.tmpliq" label:"per_col_min", op:"vminps", mr:mr, from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_max", op:"vmaxps", mr:mr, from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_add", op:"vaddps", mr:mr, from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_mul", op:"vmulps", mr:mr, from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_sub", op:"vsubps", from:from, to:to %} -{% include "zmm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} +{% include "zmm_per_col.tmpliq" label:"per_col_min", op:"vminps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_max", op:"vmaxps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_add", op:"vaddps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_mul", op:"vmulps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_sub", op:"vsubps", mr:mr, nr:nr %} +{% include "zmm_per_col.tmpliq" label:"per_col_sub_flipped", op:"vsubps", mr:mr, nr:nr, flipped:true %} diff --git a/linalg/x86_64/avx512/f32_per_rows.tmpliq b/linalg/x86_64/avx512/f32_per_rows.tmpliq index b20fcbbbbc..b84a189694 100644 --- a/linalg/x86_64/avx512/f32_per_rows.tmpliq +++ b/linalg/x86_64/avx512/f32_per_rows.tmpliq @@ -1,8 +1,14 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for the per-row instructions for f32. +--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} -{% include "zmm_per_row.tmpliq" label:"per_row_min", op:"vminps", mr:mr, from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_max", op:"vmaxps", mr:mr, from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_add", op:"vaddps", mr:mr, from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_mul", op:"vmulps", mr:mr, from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_sub", op:"vsubps", from:from, to:to %} -{% include "zmm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} +{% include "zmm_per_row.tmpliq" label:"per_row_min", op:"vminps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_max", op:"vmaxps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_add", op:"vaddps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_mul", op:"vmulps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_sub", op:"vsubps", mr:mr, nr:nr %} +{% include "zmm_per_row.tmpliq" label:"per_row_sub_flipped", op:"vsubps", mr:mr, nr:nr, flipped:true %} diff --git a/linalg/x86_64/avx512/f32_scalars.tmpliq b/linalg/x86_64/avx512/f32_scalars.tmpliq index d6a4a24fd9..1f2e498f2a 100644 --- a/linalg/x86_64/avx512/f32_scalars.tmpliq +++ b/linalg/x86_64/avx512/f32_scalars.tmpliq @@ -1,11 +1,17 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for the scalar instructions for f32. 
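+
+Every label below broadcasts the single f32 argument read from [rdi + 8]
+into zmm31 and applies the op to each accumulator register. Reference
+semantics as a hedged Rust sketch (illustrative names only, not crate code):
+
+    fn scalar_op(acc: &mut [f32], scalar: f32, op: fn(f32, f32) -> f32) {
+        // the whole mr x nr tile uses the same scalar operand
+        for v in acc.iter_mut() {
+            *v = op(*v, scalar);
+        }
+    }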
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements +{% endcomment %} -{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vminps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vaddps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", from:from, to:to %} -{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", from:from, to:to, flipped: true %} +{% include "zmm_scalar.tmpliq" label:"scalar_min", op:"vminps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_max", op:"vmaxps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_add", op:"vaddps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_mul", op:"vmulps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_sub", op:"vsubps", mr:mr, nr:nr %} +{% include "zmm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vsubps", mr:mr, nr:nr, flipped:true %} {{L}}q_scale: {{L}}q_shl: diff --git a/linalg/x86_64/avx512/f32_store_clear.tmpliq b/linalg/x86_64/avx512/f32_store_clear.tmpliq index 2e696cba1a..fd83e8f71e 100644 --- a/linalg/x86_64/avx512/f32_store_clear.tmpliq +++ b/linalg/x86_64/avx512/f32_store_clear.tmpliq @@ -6,6 +6,7 @@ Arguments: nr - kernel size in number of elements {% endcomment %} +{% assign arch_mr = mr | divided_by:16 %} {% assign arch_mr_min_1 = mr | divided_by:16 | minus:1 %} {% assign nr_min_1 = nr | minus:1 %} @@ -20,7 +21,7 @@ Arguments: {% for regcol in (0..nr_min_1) %} {% for regrow in (0..arch_mr_min_1) %} {% for quarter in (0..3) %} - vextractf32x4 xmm31, zmm{{regcol | times:2 | plus:regrow}}, {{quarter}} + vextractf32x4 xmm31, zmm{{regcol | times:arch_mr | plus:regrow}}, {{quarter}} {% for innerrow in (0..3) %} vextractps dword ptr [r9], xmm31, {{innerrow}} add r9, rsi @@ -38,15 +39,10 @@ Arguments: {{L}}clear: vzeroall // turns out vzeroall only zeroes zmm0 to zmm15 - {% for regcol in (15..last_reg) %} - vmovups zmm16, zmm0 - vmovups zmm17, zmm0 - vmovups zmm18, zmm0 - vmovups zmm19, zmm0 - vmovups zmm20, zmm0 - vmovups zmm21, zmm0 - vmovups zmm22, zmm0 - vmovups zmm23, zmm0 + {% if last_reg >= 16 %} + {% for regcol in (16..last_reg) %} + vmovups zmm{{regcol}}, zmm0 {% endfor %} + {% endif %} jmp {{L}}non_linear_loop diff --git a/linalg/x86_64/avx512/zmm_per_col.tmpliq b/linalg/x86_64/avx512/zmm_per_col.tmpliq index 16c9d32eb7..7f638482a0 100644 --- a/linalg/x86_64/avx512/zmm_per_col.tmpliq +++ b/linalg/x86_64/avx512/zmm_per_col.tmpliq @@ -1,23 +1,35 @@ -// vim: set syntax=asm : +{% comment %} +Generate the code for a per-col instruction. 
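+
+The accumulators are laid out column-major over the zmm registers, so the
+register holding tile column `right` and 16-row band `down` is
+mr_over_16 * right + from + down. A hedged Rust check of that arithmetic
+(illustrative helper, not crate code), with from = 0:
+
+    fn acc_reg(mr: usize, right: usize, down: usize) -> usize {
+        // e.g. mr = 32 (two 16-row bands): column 3, band 1 lands in zmm7,
+        // matching the zmm0/zmm1, zmm2/zmm3, ... layout of the 32x12 kernel
+        (mr / 16) * right + down
+    }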
+---
+Arguments:
+  mr - kernel size in number of elements
+  nr - kernel size in number of elements
+  op - the avx512 instruction
+  flipped - boolean to flip the order
+  label - the asm label to clear
+{% endcomment %}

 {{L}}{{label}}:
-    mov rax, [ rdi + 8 ]
+    mov rax, [rdi + 8]

-{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%}
-{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%}
+{% assign mr_over_16 = mr | divided_by:16 %}
+{% assign mr_over_16_min_1 = mr | divided_by:16 | minus:1 %}

-{%capture tmp%}{{to | plus: 1 }}{%endcapture%}
+{% assign from = 0 %}
+{% assign to = mr_over_16 | times:nr | minus:1 %}

-{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_16}}{%endcapture%}
-{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_16|minus:1}}{%endcapture%}
+{% assign tmp = to | plus:1 %}

+{% assign cols = to | plus:1 | minus:from | divided_by:mr_over_16 %}
+
+{% assign cols_min_1 = to | plus:1 | minus:from | divided_by:mr_over_16 | minus:1 %}

 // {{to|minus:from|plus:1}} cols:{{cols}}

 {% for right in (0..cols_min_1) %}
-    vbroadcastss zmm{{tmp}}, dword ptr [ rax ]
+    vbroadcastss zmm{{tmp}}, dword ptr [rax]
     add rax, 4

     {% for down in (0..mr_over_16_min_1) %}
-        {%capture acc%}{{mr_over_16|times:right|plus:from|plus:down}}{%endcapture%}
+        {% assign acc = mr_over_16 | times:right | plus:from | plus:down %}
         {% if flipped %}
             {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{tmp}}
         {% else %}
diff --git a/linalg/x86_64/avx512/zmm_per_row.tmpliq b/linalg/x86_64/avx512/zmm_per_row.tmpliq
index f9da1b35f7..1041a1a96d 100644
--- a/linalg/x86_64/avx512/zmm_per_row.tmpliq
+++ b/linalg/x86_64/avx512/zmm_per_row.tmpliq
@@ -1,22 +1,34 @@
-// vim: set syntax=asm :
+{% comment %}
+Generate the code for a per-row instruction.
+---
+Arguments:
+  mr - kernel size in number of elements
+  nr - kernel size in number of elements
+  op - the avx512 instruction
+  flipped - boolean to flip the order
+  label - the asm label to clear
+{% endcomment %}

 {{L}}{{label}}:
-    mov rax, [ rdi + 8 ]
+    mov rax, [rdi + 8]

-{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%}
-{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%}
+{% assign mr_over_16 = mr | divided_by:16 %}
+{% assign mr_over_16_min_1 = mr | divided_by:16 | minus:1 %}
+
+{% assign from = 0 %}
+{% assign to = mr_over_16 | times:nr | minus:1 %}

 {% for ix in (0..mr_over_16_min_1) %}
-    vmovups zmm{{to | plus: 1 | plus: ix}}, [rax + {{ix | times: 64}}]
+    vmovups zmm{{to | plus:1 | plus:ix}}, [rax + {{ix | times:64}}]
 {% endfor %}

 {% if flipped %}
     {% for acc in (from..to) %}
-        {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{ acc | modulo: mr_over_16 | plus: to | plus: 1 }}
+        {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{acc | modulo:mr_over_16 | plus:to | plus:1}}
     {% endfor %}
 {% else %}
     {% for acc in (from..to) %}
-        {{op}} zmm{{acc}}, zmm{{ acc | modulo: mr_over_16 | plus: to | plus: 1 }}, zmm{{acc}}
+        {{op}} zmm{{acc}}, zmm{{acc | modulo:mr_over_16 | plus:to | plus:1}}, zmm{{acc}}
     {% endfor %}
 {% endif %}
diff --git a/linalg/x86_64/avx512/zmm_scalar.tmpliq b/linalg/x86_64/avx512/zmm_scalar.tmpliq
index c38a5965cf..7c0aedacc3 100644
--- a/linalg/x86_64/avx512/zmm_scalar.tmpliq
+++ b/linalg/x86_64/avx512/zmm_scalar.tmpliq
@@ -1,4 +1,20 @@
-// vim: set syntax=asm :
+{% comment %}
+Generate the code for a scalar instruction.
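+
+The generated ops walk every accumulator register, zmm0 up to
+zmm(mr/16 * nr - 1), with zmm31 holding the broadcast scalar. A hedged Rust
+sketch of that register range (illustrative helper, not crate code):
+
+    fn acc_regs(mr: usize, nr: usize) -> std::ops::Range<usize> {
+        // e.g. a 96x2 kernel accumulates in zmm0 through zmm11
+        0..(mr / 16 * nr)
+    }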
+--- +Arguments: + mr - kernel size in number of elements + nr - kernel size in number of elements + op - the avx512 instruction + flipped - boolean to flip the order + label - the asm label to clear +{% endcomment %} + +{% assign mr_over_16 = mr | divided_by:16 %} + +{% assign from = 0 %} +{% assign to = mr_over_16 | times:nr | minus:1 %} + +// from={{from}} to={{to}} {{L}}{{label}}: vbroadcastss zmm31, dword ptr [rdi + 8] diff --git a/linalg/x86_64/kernel_throughput.py b/linalg/x86_64/kernel_throughput.py new file mode 100644 index 0000000000..6e4ae7666e --- /dev/null +++ b/linalg/x86_64/kernel_throughput.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +""" +Display the kernel throughputs as a dataframe and a csv file. + +Usage: +1. First, run the benchmarks using `cargo bench -p tract-linalg --bench kernel_test`. +2. Then run this file in the project root: `python3 linalg/x86_64/kernel_throughput.py`. + +The results are in Gelem/s. +""" + +import os +import re +import json +import os.path as path +import pandas as pd + +criterion = './target/criterion' +results = os.listdir(criterion) + +mat_common_dims = '1024x1000' + +df = pd.DataFrame(index=range(16, 256+16, 16), columns=range(1, 33), dtype='float') +for r in results: + ma = re.match("avx512_mmm_f32_(\d+)x(\d+)", r) + if not ma: + continue + + m = int(ma.group(1)) + n = int(ma.group(2)) + + path_ = path.join(criterion, r, "f32_cold", f"{mat_common_dims}x{n}") + benchmark = path.join(path_, "base/benchmark.json") + with open(benchmark) as f: + benchmark = json.load(f) + + sample = path.join(path_, "base/sample.json") + with open(sample) as f: + sample = json.load(f) + + elements = benchmark["throughput"]["Elements"] + time_per_iter = sum(sample["times"]) / sum(sample["iters"]) + + df.loc[m, n] = 1 / (time_per_iter / elements) + print(df.loc[m, n]) + df.loc[m, n] = f"{round(df.loc[m, n], 2)}" + +pd.set_option('display.max_columns', None) +print(df) +df.to_csv("result.csv") From 38b179174bdd9a754292cf20d8a844743563cbc0 Mon Sep 17 00:00:00 2001 From: Charles Chudant Date: Sat, 11 Mar 2023 01:14:08 +0000 Subject: [PATCH 3/7] unroll on k --- linalg/build.rs | 58 +++++++------ linalg/x86_64/avx512/f32_add_mat_mul.tmpliq | 93 ++++++++++++++------- 2 files changed, 89 insertions(+), 62 deletions(-) diff --git a/linalg/build.rs b/linalg/build.rs index ff8ab37cb9..f9832c594f 100644 --- a/linalg/build.rs +++ b/linalg/build.rs @@ -226,7 +226,7 @@ fn preprocess_files( let mut files = vec![]; if let Some(spec) = generate_kernels_spec { - let tmpl_file = spec.file.file_name().unwrap().to_str().unwrap(); + let tmpl_file = spec.file.file_stem().unwrap().to_str().unwrap(); for (m, n) in spec.sizes { let globals = vec![ ("mr", liquid::model::Value::scalar(format!("{m}"))), @@ -237,36 +237,34 @@ fn preprocess_files( preprocess_file(&spec.file, &file, &globals, suffix, needs_pragma); files.push(file); } - } else { - let dir_entries = { - let mut dir_entries: Vec = - input.as_ref().read_dir().unwrap().map(|f| f.unwrap()).collect(); - dir_entries.sort_by_key(|a| a.path()); - dir_entries - }; - for f in dir_entries { - if f.path().extension() == Some(ffi::OsStr::new("tmpl")) { - let tmpl_file = f.path().file_name().unwrap().to_str().unwrap().to_owned(); - let concerned_variants: Vec<&Variant> = - variants.iter().filter(|v| tmpl_file.contains(v.0)).collect(); - let expanded_variants = - concerned_variants.iter().map(|pair| pair.1.len()).product(); - for v in 0..expanded_variants { - let mut tmpl_file = tmpl_file.clone(); - let mut id = v; - let mut globals = 
vec![]; - for variable in variants { - let key = variable.0; - let value = variable.1[id % variable.1.len()]; - globals.push((key, liquid::model::Value::scalar(value))); - tmpl_file = tmpl_file.replace(key, value); - id /= variable.1.len(); - } - let mut file = out_dir.join(tmpl_file); - file.set_extension("S"); - preprocess_file(f.path(), &file, &globals, suffix, needs_pragma); - files.push(file); + } + let dir_entries = { + let mut dir_entries: Vec = + input.as_ref().read_dir().unwrap().map(|f| f.unwrap()).collect(); + dir_entries.sort_by_key(|a| a.path()); + dir_entries + }; + for f in dir_entries { + if f.path().extension() == Some(ffi::OsStr::new("tmpl")) { + let tmpl_file = f.path().file_name().unwrap().to_str().unwrap().to_owned(); + let concerned_variants: Vec<&Variant> = + variants.iter().filter(|v| tmpl_file.contains(v.0)).collect(); + let expanded_variants = concerned_variants.iter().map(|pair| pair.1.len()).product(); + for v in 0..expanded_variants { + let mut tmpl_file = tmpl_file.clone(); + let mut id = v; + let mut globals = vec![]; + for variable in variants { + let key = variable.0; + let value = variable.1[id % variable.1.len()]; + globals.push((key, liquid::model::Value::scalar(value))); + tmpl_file = tmpl_file.replace(key, value); + id /= variable.1.len(); } + let mut file = out_dir.join(tmpl_file); + file.set_extension("S"); + preprocess_file(f.path(), &file, &globals, suffix, needs_pragma); + files.push(file); } } } diff --git a/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq index eaf83392a9..b5f420ce6c 100644 --- a/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq +++ b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq @@ -14,22 +14,24 @@ Arguments: test rcx, rcx jz {{L}}non_linear_loop -{{L}}main_loop_packed_packed: +// the main loop will access A 16 elements at a time +// and B 1 element at a time +// it may be unrolled on a few elements of K -{% assign arch_mr = mr | divided_by: 16 %} -{% assign arch_mr_min_1 = mr | divided_by: 16 | minus: 1 %} +{% assign arch_mr = mr | divided_by:16 %} +{% assign arch_mr_min_1 = mr | divided_by:16 | minus:1 %} -{% assign nr_min_1 = nr | minus: 1 %} +{% assign nr_min_1 = nr | minus:1 %} // total bytes of the tile on the m axis -{% assign m_total_bytes = mr | times: 4 %} +{% assign m_total_bytes = mr | times:4 %} // total bytes of the tile on the n axis -{% assign n_total_bytes = nr | times: 4 %} +{% assign n_total_bytes = nr | times:4 %} // first register to be used for row -{% assign row_reg = arch_mr | times: nr %} +{% assign row_reg = arch_mr | times:nr %} // the column register -{% assign col_reg = row_reg | plus: arch_mr | plus: 1 %} +{% assign col_reg = row_reg | plus:arch_mr | plus:1 %} {% assign prefetch_dist = 2 %} @@ -37,38 +39,65 @@ Arguments: // because otherwise we end up producing too many prefetches at once // and, we fill the Line Fill Buffer, which is the cpu's buffer for // outstanding fetch request for L1 - which by the way has a size of -// 10 requests on most cpus +// 10 requests on most intel cpus // // filling up the LFB is actually a very big deal, because subsequent prefetches // will block until there is space in the LFB -// -// so we definitely do not want to issue 10 prefetches when mr is 10 :D -// we only issue 2 prefetches here at most -{% assign prefetches_to_issue_min_1 = arch_mr | at_most: 2 | minus: 1 %} -{% for i in (0..prefetches_to_issue_min_1) %} - prefetcht0 [rax + {{i | times:64}} + {{m_total_bytes | times:prefetch_dist}}] -{% endfor %} +{% assign 
prefetches_to_issue_min_1 = arch_mr | at_most:2 | minus:1 %} -{% for i in (0..arch_mr_min_1) %} - vmovaps zmm{{row_reg | plus:i}}, [rax + {{i | times:64}}] -{% endfor %} +// how many unrolls on k should we produce +{% assign unroll_count = 8 %} +{% assign unroll_count_min_1 = unroll_count | minus:1 %} -// this loop will access A 16 elements at a time -// and B 1 element at a time +// this is the dispatch part +{{L}}main_loop_packed_packed: + +// hardcoded 8 unrolls + + cmp rbx, 1 + jb {{L}}non_linear_loop + je {{L}}main_loop_packed_packed_1 + cmp rbx, 3 + jb {{L}}main_loop_packed_packed_2 + je {{L}}main_loop_packed_packed_3 + cmp rbx, 5 + jb {{L}}main_loop_packed_packed_4 + je {{L}}main_loop_packed_packed_5 + cmp rbx, 7 + jb {{L}}main_loop_packed_packed_6 + je {{L}}main_loop_packed_packed_7 +{% comment %} +{% endcomment %} -{% for i in (0..nr_min_1) %} - vbroadcastss zmm{{col_reg}}, dword ptr [rcx + {{i | times:4}}] +{% for unroll in (0..unroll_count_min_1) %} - {% for j in (0..arch_mr_min_1) %} - vfmadd231ps zmm{{i | times:arch_mr | plus:j}}, zmm{{row_reg | plus:j}}, zmm{{col_reg}} - {% endfor %} + {% assign n_items_on_k = unroll_count | minus:unroll %} -{% endfor %} + {{L}}main_loop_packed_packed_{{n_items_on_k}}: + + {% assign unroll_min_1 = n_items_on_k | minus:1 %} + {% for cur_unroll_count in (0..unroll_min_1) %} + + {% for i in (0..prefetches_to_issue_min_1) %} + prefetcht0 [rax + {{i | times:64}} + {{m_total_bytes | times:prefetch_dist}} + {{cur_unroll_count | times:m_total_bytes}}] + {% endfor %} - add rax, {{m_total_bytes}} - add rcx, {{n_total_bytes}} + {% for i in (0..arch_mr_min_1) %} + vmovaps zmm{{row_reg | plus:i}}, [rax + {{i | times:64}} + {{cur_unroll_count | times:m_total_bytes}}] + {% endfor %} - dec rbx - jnz {{L}}main_loop_packed_packed + {% for i in (0..nr_min_1) %} + vbroadcastss zmm{{col_reg}}, dword ptr [rcx + {{i | times:4}} + {{cur_unroll_count | times:n_total_bytes}}] - jmp {{L}}non_linear_loop + {% for j in (0..arch_mr_min_1) %} + vfmadd231ps zmm{{i | times:arch_mr | plus:j}}, zmm{{row_reg | plus:j}}, zmm{{col_reg}} + {% endfor %} + {% endfor %} + {% endfor %} + + add rax, {{m_total_bytes | times:n_items_on_k}} + add rcx, {{n_total_bytes | times:n_items_on_k}} + sub rbx, {{n_items_on_k}} + + jmp {{L}}main_loop_packed_packed +{% endfor %} From c2304c12ff005d804036a04de57e93537d7da952 Mon Sep 17 00:00:00 2001 From: Charles Chudant Date: Sun, 12 Mar 2023 16:38:17 +0000 Subject: [PATCH 4/7] better scatter/gather, feature for compiling all kernels, better kernel selection --- linalg/Cargo.toml | 7 ++ linalg/build.rs | 40 +++++-- linalg/src/x86_64_fma.rs | 73 +++++++++++- linalg/src/x86_64_fma/mmm.rs | 116 +++++++++++--------- linalg/x86_64/avx512/avx512_mmm_f32.tmpliq | 17 ++- linalg/x86_64/avx512/f32_add_mat_mul.tmpliq | 4 +- linalg/x86_64/avx512/f32_add_unicast.tmpliq | 110 +++++++------------ linalg/x86_64/avx512/f32_store_clear.tmpliq | 71 +++++++++--- linalg/x86_64/kernel_throughput.py | 5 +- 9 files changed, 278 insertions(+), 165 deletions(-) mode change 100644 => 100755 linalg/x86_64/kernel_throughput.py diff --git a/linalg/Cargo.toml b/linalg/Cargo.toml index 5991d1dc79..609456f0e9 100644 --- a/linalg/Cargo.toml +++ b/linalg/Cargo.toml @@ -47,6 +47,8 @@ core_affinity.workspace = true no_fp16 = [] default = [] complex = [ "tract-data/complex" ] +# Internal feature for benchmarking matmul kernels +compile_all_kernels = [] [[bench]] bench = false @@ -104,3 +106,8 @@ harness = false bench = false name = "kernel_test" harness = false + +[[bench]] +bench = 
false
name = "kernel_selection"
harness = false
diff --git a/linalg/build.rs b/linalg/build.rs
index f9832c594f..148febbefd 100644
--- a/linalg/build.rs
+++ b/linalg/build.rs
@@ -89,16 +89,36 @@ fn main() {
     match arch.as_ref() {
         "x86_64" => {
             let mut files = preprocess_files("x86_64/fma", &[], &suffix, false, None);
-            // limits of the size of the kernels in avx512; index is n-1
-            let avx_kernels_max = [
-                240, 160, 112, 96, 80, 64, 48, 48, 48, 32, 32, 32, 32, 32, 16, 16, 16, 16, 16, 16,
-                16, 16, 16, 16, 16, 16, 16, 16, 16,
-            ];
-            let avx512_kernels: Vec<_> = avx_kernels_max
-                .iter()
-                .enumerate()
-                .flat_map(|(n_min_1, &max)| (16..=max).step_by(16).map(move |m| (m, n_min_1 + 1)))
-                .collect();
+
+            let avx512_kernels: Vec<_> = if cfg!(feature = "compile_all_kernels") {
+                // limits of the max M size of the kernels in avx512; index is n-1
+                let avx512_kernels_max = [
+                    240, 160, 112, 96, 80, 64, 48, 48, 48, 32, 32, 32, 32, 32, 16, 16, 16, 16, 16,
+                    16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+                ];
+                avx512_kernels_max
+                    .iter()
+                    .enumerate()
+                    .flat_map(|(n_min_1, &max)| {
+                        (16..=max).step_by(16).map(move |m| (m, n_min_1 + 1))
+                    })
+                    .collect()
+            } else {
+                vec![
+                    (96, 1),
+                    (96, 2),
+                    (80, 3),
+                    (64, 4),
+                    (32, 5),
+                    (32, 6),
+                    (32, 7),
+                    (32, 8),
+                    (32, 9),
+                    (32, 10),
+                    (32, 11),
+                    (32, 12),
+                ]
+            };

             files.extend(preprocess_files(
                 "x86_64/avx512",
diff --git a/linalg/src/x86_64_fma.rs b/linalg/src/x86_64_fma.rs
index 4e9e4dcaa3..3d6f54d908 100644
--- a/linalg/src/x86_64_fma.rs
+++ b/linalg/src/x86_64_fma.rs
@@ -1,3 +1,7 @@
+use std::cmp::Ordering;
+
+use tract_data::internal::num_integer::Integer;
+
 use crate::frame::element_wise::ElementWiseKer;
 use crate::frame::mmm::kernel::MatMatMulKer;
 use crate::Ops;
@@ -96,14 +100,71 @@ fn plug_fma(ops: &mut Ops) {
 fn plug_avx512f(ops: &mut Ops) {
     ops.mmv_f32 = Box::new(|m, _k| match m {
         Some(m) if m < 31 => mmm::avx512_mmm_f32_16x1::mmm(),
-        _ => mmm::avx512_mmm_f32_128x1::mmm(),
+        _ => mmm::avx512_mmm_f32_96x1::mmm(),
     });
-    ops.mmm_f32 = Box::new(|_, _, n| match n {
-        Some(1) => unreachable!("should've been mmv"),
-        Some(2) => mmm::avx512_mmm_f32_80x2::mmm(),
-        Some(n) if n % 4 == 0 && n % 3 != 0 => mmm::avx512_mmm_f32_48x4::mmm(),
-        _ => mmm::avx512_mmm_f32_64x3::mmm(),
+    ops.mmm_f32 = Box::new(|_, _, n| {
+        if n.is_none() {
+            return mmm::avx512_mmm_f32_32x12::mmm();
+        }
+        let mut n = n.unwrap();
+
+        if n > 14 {
+            // throughputs are measured using the kernel_throughput.py script
+            let scaling_baseline = 98.0;
+            let kernel_throughputs = [
+                (2, 18.0 / scaling_baseline),
+                (3, 28.0 / scaling_baseline),
+                (4, 36.5 / scaling_baseline),
+                (5, 44.0 / scaling_baseline),
+                (6, 49.0 / scaling_baseline),
+                (7, 58.0 / scaling_baseline),
+                (8, 65.0 / scaling_baseline),
+                (9, 72.5 / scaling_baseline),
+                (10, 82.0 / scaling_baseline),
+                (11, 84.0 / scaling_baseline),
+                (12, 88.5 / scaling_baseline),
+                (13, 95.0 / scaling_baseline),
+                (14, 98.0 / scaling_baseline),
+            ];
+
+            let throughputs = kernel_throughputs.map(|(kernel_width, thrpt): (usize, f32)| {
+                let n_tiles = Integer::div_ceil(&n, &kernel_width);
+
+                let n_elem_total = n_tiles * kernel_width;
+                let n_elem_on_border_tile = n_elem_total - n;
+                let wasted_ratio = n_elem_on_border_tile as f32 / n_elem_total as f32;
+
+                let final_thrpt = thrpt * (1.0 - wasted_ratio);
+
+                (kernel_width, final_thrpt)
+            });
+
+            let best_ker = *throughputs
+                .iter()
+                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal))
+                .map(|(ker_width, _)| ker_width)
+                .unwrap();
+
+            n = best_ker;
+        }
+
+        match n {
+            2
=> mmm::avx512_mmm_f32_96x2::mmm(), + 3 => mmm::avx512_mmm_f32_80x3::mmm(), + 4 => mmm::avx512_mmm_f32_64x4::mmm(), + 5 => mmm::avx512_mmm_f32_32x5::mmm(), + 6 => mmm::avx512_mmm_f32_32x6::mmm(), + 7 => mmm::avx512_mmm_f32_32x7::mmm(), + 8 => mmm::avx512_mmm_f32_32x8::mmm(), + 9 => mmm::avx512_mmm_f32_32x9::mmm(), + 10 => mmm::avx512_mmm_f32_32x10::mmm(), + 11 => mmm::avx512_mmm_f32_32x11::mmm(), + 12 => mmm::avx512_mmm_f32_32x12::mmm(), + 13 => mmm::avx512_mmm_f32_32x13::mmm(), + 14 => mmm::avx512_mmm_f32_32x14::mmm(), + _ => unreachable!("not a valid index"), + } }); log::info!("mmm_f32, mmv_f32: x86_64/avx512f activated"); } diff --git a/linalg/src/x86_64_fma/mmm.rs b/linalg/src/x86_64_fma/mmm.rs index 23cc63f67f..20a1c01140 100644 --- a/linalg/src/x86_64_fma/mmm.rs +++ b/linalg/src/x86_64_fma/mmm.rs @@ -10,57 +10,69 @@ MMMKernel!(f32, fma_mmm_f32_64x1; 64, 1; 32, 4; 0, 0; no_prefetch, is_x86_featur MMMKernel!(i32, avx2_mmm_i32_8x8; 8, 8; 32, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx2")); -// MMMKernel!(f32, avx512_mmm_f32_240x1; 240, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_160x2; 160, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_112x3; 112, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_96x4; 96, 4; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_80x5; 80, 5; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_64x6; 64, 6; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_48x7; 48, 7; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_48x8; 48, 8; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_48x9; 48, 9; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_32x10; 32, 10; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_32x11; 32, 11; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); -// MMMKernel!(f32, avx512_mmm_f32_32x12; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - -macro_rules! make_kernels_for_n { - ($n:expr ; $m:expr) => ( - paste! 
{
-            MMMKernel!(f32, [<avx512_mmm_f32_ $m x $n>]; $m, $n; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
-        }
-    );
-    ($n:expr ; $m1:expr, $($y:expr),+) => (
-        make_kernels_for_n!($n ; $m1);
-        make_kernels_for_n!($n ; $($y),+);
-    )
+#[cfg(not(feature = "compile_all_kernels"))]
+mod avx512_best {
+    use super::*;
+    MMMKernel!(f32, avx512_mmm_f32_240x1; 240, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_160x2; 160, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_112x3; 112, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_96x4; 96, 4; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_80x5; 80, 5; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_64x6; 64, 6; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_48x7; 48, 7; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_48x8; 48, 8; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_48x9; 48, 9; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_32x10; 32, 10; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_32x11; 32, 11; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+    MMMKernel!(f32, avx512_mmm_f32_32x12; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
 }
+#[cfg(not(feature = "compile_all_kernels"))]
+pub use avx512_best::*;
+
+#[cfg(feature = "compile_all_kernels")]
+mod all_avx512 {
+    use super::*;
+    macro_rules! make_kernels_for_n {
+        ($n:expr ; $m:expr) => (
+            paste! 
{
+                MMMKernel!(f32, [<avx512_mmm_f32_ $m x $n>]; $m, $n; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f"));
+            }
+        );
+        ($n:expr ; $m1:expr, $($y:expr),+) => (
+            make_kernels_for_n!($n ; $m1);
+            make_kernels_for_n!($n ; $($y),+);
+        )
+    }
+
+    make_kernels_for_n!(1 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
+    make_kernels_for_n!(2 ; 16, 32, 48, 64, 80, 96, 112, 128, 144, 160);
+    make_kernels_for_n!(3 ; 16, 32, 48, 64, 80, 96, 112);
+    make_kernels_for_n!(4 ; 16, 32, 48, 64, 80, 96);
+    make_kernels_for_n!(5 ; 16, 32, 48, 64, 80);
+    make_kernels_for_n!(6 ; 16, 32, 48, 64);
+    make_kernels_for_n!(7 ; 16, 32, 48);
+    make_kernels_for_n!(8 ; 16, 32, 48);
+    make_kernels_for_n!(9 ; 16, 32, 48);
+    make_kernels_for_n!(10 ; 16, 32);
+    make_kernels_for_n!(11 ; 16, 32);
+    make_kernels_for_n!(12 ; 16, 32);
+    make_kernels_for_n!(13 ; 16, 32);
+    make_kernels_for_n!(14 ; 16, 32);
+    make_kernels_for_n!(15 ; 16);
+    make_kernels_for_n!(16 ; 16);
+    make_kernels_for_n!(17 ; 16);
+    make_kernels_for_n!(18 ; 16);
+    make_kernels_for_n!(19 ; 16);
+    make_kernels_for_n!(20 ; 16);
+    make_kernels_for_n!(21 ; 16);
+    make_kernels_for_n!(22 ; 16);
+    make_kernels_for_n!(23 ; 16);
+    make_kernels_for_n!(24 ; 16);
+    make_kernels_for_n!(25 ; 16);
+    make_kernels_for_n!(26 ; 16);
+    make_kernels_for_n!(27 ; 16);
+    make_kernels_for_n!(28 ; 16);
+    make_kernels_for_n!(29 ; 16);
+}
+#[cfg(feature = "compile_all_kernels")]
+pub use all_avx512::*;
diff --git a/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq b/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq
index b83cff8d09..1ed4b1e9db 100644
--- a/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq
+++ b/linalg/x86_64/avx512/avx512_mmm_f32.tmpliq
@@ -7,11 +7,24 @@ Arguments:
 {% endcomment %}

 // The kernel will operate on mr times nr elements at once,
-// by laying them out in the registers as such:
+// by laying them out in the zmm registers.
 //
-// mmm f32 32 x 12:
+// As an example, mmm f32 32 x 12 will be laid out this way:
 // zmm0 zmm2 zmm4 zmm6 zmm8 zmm10 zmm12 zmm14 zmm16 zmm18 zmm20 zmm22
 // zmm1 zmm3 zmm5 zmm7 zmm9 zmm11 zmm13 zmm15 zmm17 zmm19 zmm21 zmm23
+//
+// The scratch registers are currently:
+// - zmm31 and zmm30 in every case
+// - every zmm register from `mr_arch * nr` to the end
+//
+// This means you always have at least mr_arch scratch registers
+// plus zmm31 and zmm30 available.
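+//
+// As a worked check (a hedged sketch; the helper name is illustrative): for
+// the 32x12 kernel mr_arch is 2, so zmm0-zmm23 hold accumulators and
+// zmm24-zmm29 stay free on top of zmm30/zmm31.
+//
+//     fn register_budget(mr: usize, nr: usize) -> (usize, usize) {
+//         let mr_arch = mr / 16;
+//         // (accumulator registers, scratch registers besides zmm30/zmm31)
+//         (mr_arch * nr, 30 - mr_arch * nr)
+//     }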
+// +// More scratch registers may be added later if we decide to limit the +// range for data registers, as we don't really need the biggest kernel sizes. +// +// The list of possible kernel sizes is thus defined by every mr,nr combinations +// that match `mr_arch * nr + mr_arch + 2 <= 32`. {% assign kernel_name = mr | append:"x" | append:nr %} diff --git a/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq index b5f420ce6c..32835a0783 100644 --- a/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq +++ b/linalg/x86_64/avx512/f32_add_mat_mul.tmpliq @@ -46,7 +46,7 @@ Arguments: {% assign prefetches_to_issue_min_1 = arch_mr | at_most:2 | minus:1 %} // how many unrolls on k should we produce -{% assign unroll_count = 8 %} +{% assign unroll_count = 4 %} {% assign unroll_count_min_1 = unroll_count | minus:1 %} // this is the dispatch part @@ -60,13 +60,13 @@ Arguments: cmp rbx, 3 jb {{L}}main_loop_packed_packed_2 je {{L}}main_loop_packed_packed_3 +{% comment %} cmp rbx, 5 jb {{L}}main_loop_packed_packed_4 je {{L}}main_loop_packed_packed_5 cmp rbx, 7 jb {{L}}main_loop_packed_packed_6 je {{L}}main_loop_packed_packed_7 -{% comment %} {% endcomment %} {% for unroll in (0..unroll_count_min_1) %} diff --git a/linalg/x86_64/avx512/f32_add_unicast.tmpliq b/linalg/x86_64/avx512/f32_add_unicast.tmpliq index afec5451dc..ff8bab37e5 100644 --- a/linalg/x86_64/avx512/f32_add_unicast.tmpliq +++ b/linalg/x86_64/avx512/f32_add_unicast.tmpliq @@ -6,6 +6,7 @@ Arguments: nr - kernel size in number of elements {% endcomment %} +{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%}{%endcapture%} {{L}}add_unicast: @@ -13,91 +14,54 @@ Arguments: mov rsi, [rdi + 16] // row stride mov rbx, [rdi + 24] // col stride - mov eax, 0 + // gather operation -// this is a hack - we move stuff around because -// pinsrd and vperm2f128 don't support ymm16-ymm31 registers -// meaning we need some scratch registers on ymm0-ymm16 -// however we have our data there :/ + vpbroadcastd zmm30, esi + vpmulld zmm30, zmm30, zmmword ptr [{{offset}} {{L}}numbers_seq_add_unicast] -{% assign last_data_reg = mr | divided_by:16 | times:nr | minus:1 %} -{% if last_data_reg >= 12 %} - vmovups zmm28, zmm12 -{% endif %} -{% if last_data_reg >= 13 %} - vmovups zmm29, zmm13 -{% endif %} -{% if last_data_reg >= 14 %} - vmovups zmm30, zmm14 -{% endif %} -{% if last_data_reg >= 15 %} - vmovups zmm31, zmm15 -{% endif %} - -{% for i in (0..3) %} - pinsrd xmm14, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm15, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm12, eax, {{i}} - add eax, esi -{% endfor %} -{% for i in (0..3) %} - pinsrd xmm13, eax, {{i}} - add eax, esi -{% endfor %} - - vperm2f128 ymm14, ymm14, ymm15, 32 // ymm14 <- xmm14::xmm15 - vperm2f128 ymm13, ymm12, ymm13, 32 // ymm12 <- xmm12::xmm13 - vinsertf32x8 zmm14, zmm14, ymm13, 1 - - vmovups zmm25, zmm15 - vmovups zmm26, zmm14 - vmovups zmm27, zmm12 - -{% if last_data_reg >= 12 %} - vmovups zmm12, zmm28 -{% endif %} -{% if last_data_reg >= 13 %} - vmovups zmm13, zmm29 -{% endif %} -{% if last_data_reg >= 14 %} - vmovups zmm14, zmm30 -{% endif %} -{% if last_data_reg >= 15 %} - vmovups zmm15, zmm31 -{% endif %} + // zmm30 is now a sequence of 0,esi,esi*2,esi*3...esi*15 {% assign nr_min_1 = nr | minus:1 %} {% assign mr_arch = mr | divided_by:16 %} {% assign mr_arch_min_1 = mr | divided_by:16 | minus:1 %} -{% for i in (0..nr_min_1) %} - kxnorw k1,k1,k1 - vgatherdps zmm27{k1}, [r10 + zmm26] - add r10, rbx - 
vaddps zmm{{i | times:mr_arch}}, zmm{{i | times:mr_arch}}, zmm27 -{% endfor %} + // r10 is cur col + imul rsi, 16 // stride for 16 elems - imul esi, 16 - vpbroadcastd zmm25, esi +{% for col in (0..nr_min_1) %} + mov r9, r10 // cur row -{% for j in (1..mr_arch_min_1) %} - mov r10, [rdi + 8] - vpaddd zmm26, zmm26, zmm25 + {% for row in (0..mr_arch_min_1) %} + kxnorw k1,k1,k1 // set writemask to ones + vgatherdps zmm31{k1}, [r9 + zmm30] + vaddps zmm{{col | times:mr_arch | plus:row}}, zmm{{col | times:mr_arch | plus:row}}, zmm31 - {% for i in (0..nr_min_1) %} - kxnorw k1,k1,k1 - vgatherdps zmm27{k1}, [r10 + zmm26] - add r10, rbx - vaddps zmm{{i | times:mr_arch | plus:j}}, zmm{{i | times:mr_arch | plus:j}}, zmm27 + {% if row != mr_arch_min_1 %} + add r9, rsi + {% endif %} {% endfor %} -{% endfor %} + {% if col != nr_min_1 %} + add r10, rbx + {% endif %} +{% endfor %} jmp {{L}}non_linear_loop +{{L}}numbers_seq_add_unicast: + {{long}} 0 + {{long}} 1 + {{long}} 2 + {{long}} 3 + {{long}} 4 + {{long}} 5 + {{long}} 6 + {{long}} 7 + {{long}} 8 + {{long}} 9 + {{long}} 10 + {{long}} 11 + {{long}} 12 + {{long}} 13 + {{long}} 14 + {{long}} 15 diff --git a/linalg/x86_64/avx512/f32_store_clear.tmpliq b/linalg/x86_64/avx512/f32_store_clear.tmpliq index fd83e8f71e..9d79849c76 100644 --- a/linalg/x86_64/avx512/f32_store_clear.tmpliq +++ b/linalg/x86_64/avx512/f32_store_clear.tmpliq @@ -6,40 +6,77 @@ Arguments: nr - kernel size in number of elements {% endcomment %} +{%capture offset%}{% if msvc %} offset {%else%} rip + {%endif%}{%endcapture%} + {% assign arch_mr = mr | divided_by:16 %} {% assign arch_mr_min_1 = mr | divided_by:16 | minus:1 %} {% assign nr_min_1 = nr | minus:1 %} {{L}}store: - mov r8, [rdi + 8] // c ptr + + mov r10, [rdi + 8] // c ptr mov rsi, [rdi + 16] // row stride mov rbx, [rdi + 24] // col stride - mov r9, r8 // current row - mov r10, r8 // current col - - {% for regcol in (0..nr_min_1) %} - {% for regrow in (0..arch_mr_min_1) %} - {% for quarter in (0..3) %} - vextractf32x4 xmm31, zmm{{regcol | times:arch_mr | plus:regrow}}, {{quarter}} - {% for innerrow in (0..3) %} - vextractps dword ptr [r9], xmm31, {{innerrow}} - add r9, rsi - {% endfor %} - {% endfor %} - {% endfor %} - add r10, rbx - mov r9, r10 + // scatter operation + + vpbroadcastd zmm30, esi + vpmulld zmm30, zmm30, zmmword ptr [{{offset}} {{L}}numbers_seq_store] + + // zmm30 is now a sequence of 0,esi,esi*2,esi*3...esi*15 + +{% assign nr_min_1 = nr | minus:1 %} +{% assign mr_arch = mr | divided_by:16 %} +{% assign mr_arch_min_1 = mr | divided_by:16 | minus:1 %} + + // r10 is cur col + imul rsi, 16 // stride for 16 elems + +{% for col in (0..nr_min_1) %} + mov r9, r10 // cur row + + {% for row in (0..mr_arch_min_1) %} + vmovaps zmm31, zmm{{col | times:mr_arch | plus:row}} + + kxnorw k1,k1,k1 // set writemask to ones + vscatterdps [r9 + zmm30]{k1}, zmm31 + + {% if row != mr_arch_min_1 %} + add r9, rsi + {% endif %} {% endfor %} + {% if col != nr_min_1 %} + add r10, rbx + {% endif %} +{% endfor %} + jmp {{L}}non_linear_loop +{{L}}numbers_seq_store: + {{long}} 0 + {{long}} 1 + {{long}} 2 + {{long}} 3 + {{long}} 4 + {{long}} 5 + {{long}} 6 + {{long}} 7 + {{long}} 8 + {{long}} 9 + {{long}} 10 + {{long}} 11 + {{long}} 12 + {{long}} 13 + {{long}} 14 + {{long}} 15 + {% assign last_reg = mr | divided_by:16 | times:nr | minus:1 %} {{L}}clear: vzeroall - // turns out vzeroall only zeroes zmm0 to zmm15 {% if last_reg >= 16 %} + // turns out vzeroall only zeroes zmm0 to zmm15 {% for regcol in (16..last_reg) %} vmovups zmm{{regcol}}, zmm0 {% 
endfor %} diff --git a/linalg/x86_64/kernel_throughput.py b/linalg/x86_64/kernel_throughput.py old mode 100644 new mode 100755 index 6e4ae7666e..416cc303ee --- a/linalg/x86_64/kernel_throughput.py +++ b/linalg/x86_64/kernel_throughput.py @@ -4,7 +4,7 @@ Display the kernel throughputs as a dataframe and a csv file. Usage: -1. First, run the benchmarks using `cargo bench -p tract-linalg --bench kernel_test`. +1. First, run the benchmarks using `cargo bench -p tract-linalg --features compile_all_kernels --bench kernel_test`. 2. Then run this file in the project root: `python3 linalg/x86_64/kernel_throughput.py`. The results are in Gelem/s. @@ -42,9 +42,8 @@ elements = benchmark["throughput"]["Elements"] time_per_iter = sum(sample["times"]) / sum(sample["iters"]) - df.loc[m, n] = 1 / (time_per_iter / elements) + df.loc[m, n] = round(1 / (time_per_iter / elements), 2) print(df.loc[m, n]) - df.loc[m, n] = f"{round(df.loc[m, n], 2)}" pd.set_option('display.max_columns', None) print(df) From 9fa96012cabbde38d82719314fd8c30d1e9bc5b9 Mon Sep 17 00:00:00 2001 From: Charles Chudant Date: Sun, 12 Mar 2023 18:08:01 +0000 Subject: [PATCH 5/7] fix CI issues --- linalg/Cargo.toml | 5 ----- linalg/src/x86_64_fma/mmm.rs | 21 ++++++++++++--------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/linalg/Cargo.toml b/linalg/Cargo.toml index 609456f0e9..5373b58d8d 100644 --- a/linalg/Cargo.toml +++ b/linalg/Cargo.toml @@ -106,8 +106,3 @@ harness = false bench = false name = "kernel_test" harness = false - -[[bench]] -bench = false -name = "kernel_selection" -harness = false diff --git a/linalg/src/x86_64_fma/mmm.rs b/linalg/src/x86_64_fma/mmm.rs index 20a1c01140..1bbe1ec79c 100644 --- a/linalg/src/x86_64_fma/mmm.rs +++ b/linalg/src/x86_64_fma/mmm.rs @@ -13,18 +13,21 @@ MMMKernel!(i32, avx2_mmm_i32_8x8; 8, 8; 32, 4; 0, 0; no_prefetch, is_x86_feature #[cfg(not(feature = "compile_all_kernels"))] mod avx512_best { use super::*; - MMMKernel!(f32, avx512_mmm_f32_240x1; 240, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_160x2; 160, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_112x3; 112, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_96x4; 96, 4; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_80x5; 80, 5; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_64x6; 64, 6; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_48x7; 48, 7; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_48x8; 48, 8; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_48x9; 48, 9; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_16x1; 12, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_96x1; 96, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_96x2; 96, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_80x3; 80, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_64x4; 64, 4; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x5; 32, 5; 64, 4; 0, 0; 
no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x6; 32, 6; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x7; 32, 7; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x8; 32, 8; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x9; 32, 9; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(f32, avx512_mmm_f32_32x10; 32, 10; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(f32, avx512_mmm_f32_32x11; 32, 11; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(f32, avx512_mmm_f32_32x12; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x13; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x14; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); } #[cfg(not(feature = "compile_all_kernels"))] pub use avx512_best::*; From 8ed347768e2e18ce02aac274f90acc83060f3779 Mon Sep 17 00:00:00 2001 From: Charles Chudant Date: Sun, 12 Mar 2023 18:14:18 +0000 Subject: [PATCH 6/7] fix compile issue --- linalg/build.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/linalg/build.rs b/linalg/build.rs index 148febbefd..bc59f0258e 100644 --- a/linalg/build.rs +++ b/linalg/build.rs @@ -105,6 +105,7 @@ fn main() { .collect() } else { vec![ + (16, 1), (96, 1), (96, 2), (80, 3), @@ -117,6 +118,8 @@ fn main() { (32, 10), (32, 11), (32, 12), + (32, 13), + (32, 14), ] }; From e8f82bcfe250e0786ea4cd0df049c3d4faf064b4 Mon Sep 17 00:00:00 2001 From: Charles Chudant Date: Sun, 12 Mar 2023 20:13:16 +0000 Subject: [PATCH 7/7] fix kernel definition on rust side --- linalg/src/x86_64_fma/mmm.rs | 6 +++--- linalg/x86_64/avx512/f32_store_clear.tmpliq | 10 ++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/linalg/src/x86_64_fma/mmm.rs b/linalg/src/x86_64_fma/mmm.rs index 1bbe1ec79c..bbc6ac5cbb 100644 --- a/linalg/src/x86_64_fma/mmm.rs +++ b/linalg/src/x86_64_fma/mmm.rs @@ -13,7 +13,7 @@ MMMKernel!(i32, avx2_mmm_i32_8x8; 8, 8; 32, 4; 0, 0; no_prefetch, is_x86_feature #[cfg(not(feature = "compile_all_kernels"))] mod avx512_best { use super::*; - MMMKernel!(f32, avx512_mmm_f32_16x1; 12, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_16x1; 16, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(f32, avx512_mmm_f32_96x1; 96, 1; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(f32, avx512_mmm_f32_96x2; 96, 2; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(f32, avx512_mmm_f32_80x3; 80, 3; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); @@ -26,8 +26,8 @@ mod avx512_best { MMMKernel!(f32, avx512_mmm_f32_32x10; 32, 10; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(f32, avx512_mmm_f32_32x11; 32, 11; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); MMMKernel!(f32, avx512_mmm_f32_32x12; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_32x13; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); - MMMKernel!(f32, avx512_mmm_f32_32x14; 32, 12; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x13; 32, 13; 64, 4; 0, 0; 
no_prefetch, is_x86_feature_detected!("avx512f")); + MMMKernel!(f32, avx512_mmm_f32_32x14; 32, 14; 64, 4; 0, 0; no_prefetch, is_x86_feature_detected!("avx512f")); } #[cfg(not(feature = "compile_all_kernels"))] pub use avx512_best::*; diff --git a/linalg/x86_64/avx512/f32_store_clear.tmpliq b/linalg/x86_64/avx512/f32_store_clear.tmpliq index 9d79849c76..50b099532d 100644 --- a/linalg/x86_64/avx512/f32_store_clear.tmpliq +++ b/linalg/x86_64/avx512/f32_store_clear.tmpliq @@ -20,10 +20,10 @@ Arguments: // scatter operation - vpbroadcastd zmm30, esi - vpmulld zmm30, zmm30, zmmword ptr [{{offset}} {{L}}numbers_seq_store] + vpbroadcastd zmm31, esi + vpmulld zmm31, zmm31, zmmword ptr [{{offset}} {{L}}numbers_seq_store] - // zmm30 is now a sequence of 0,esi,esi*2,esi*3...esi*15 + // zmm31 is now a sequence of 0,esi,esi*2,esi*3...esi*15 {% assign nr_min_1 = nr | minus:1 %} {% assign mr_arch = mr | divided_by:16 %} @@ -36,10 +36,8 @@ Arguments: mov r9, r10 // cur row {% for row in (0..mr_arch_min_1) %} - vmovaps zmm31, zmm{{col | times:mr_arch | plus:row}} - kxnorw k1,k1,k1 // set writemask to ones - vscatterdps [r9 + zmm30]{k1}, zmm31 + vscatterdps [r9 + zmm31]{k1}, zmm{{col | times:mr_arch | plus:row}} {% if row != mr_arch_min_1 %} add r9, rsi