From 50b7117e4bc4a2c675002529b79823ebcca3160e Mon Sep 17 00:00:00 2001
From: Michael Yeh <111819036+myeh01@users.noreply.github.com>
Date: Thu, 28 Nov 2024 16:05:59 -0800
Subject: [PATCH] Use intrinsics for all sifive_x280 kernels (#822)

Details:
- Replace all assembly kernels in the `sifive_x280` kernel set with
  intrinsic versions.
- Fix a bug encountered in #805.
- Update the RISC-V toolchain used in CI testing.
- Special thanks to Michael Yeh (@myeh01) and SiFive.
---
 .../sifive_x280/bli_cntx_init_sifive_x280.c   | 106 +-
 config/sifive_x280/make_defs.mk               | 2 +-
 .../sifive_x280/1/bli_amaxv_sifive_x280_asm.c | 293 --
 .../bli_amaxv_sifive_x280_intr.c              | 179 +
 .../bli_amaxv_sifive_x280_intr_complex.c      | 105 +
 .../bli_amaxv_sifive_x280_intr_real.c         | 100 +
 .../bli_axpbyv_sifive_x280_intr.c             | 4 +-
 .../sifive_x280/1/bli_copyv_sifive_x280_asm.c | 272 --
 .../bli_copyv_sifive_x280_intr.c              | 116 +
 .../bli_copyv_sifive_x280_intr_complex.c      | 75 +
 .../bli_copyv_sifive_x280_intr_real.c         | 68 +
 .../1/bli_invertv_sifive_x280_asm.c           | 221 --
 .../bli_invertv_sifive_x280_intr.c            | 118 +
 .../bli_invertv_sifive_x280_intr_complex.c    | 83 +
 .../bli_invertv_sifive_x280_intr_real.c       | 68 +
 .../1/bli_invscalv_sifive_x280_asm.c          | 266 --
 .../bli_invscalv_sifive_x280_intr.c           | 117 +
 .../bli_invscalv_sifive_x280_intr_complex.c   | 83 +
 .../bli_invscalv_sifive_x280_intr_real.c      | 75 +
 .../bli_scal2v_sifive_x280_intr.c             | 4 +-
 .../bli_scal2v_sifive_x280_intr_complex.c     | 6 +-
 .../bli_scalv_sifive_x280_intr.c              | 2 +-
 .../sifive_x280/1/bli_setv_sifive_x280_asm.c  | 204 --
 .../bli_setv_sifive_x280_intr.c               | 116 +
 .../bli_setv_sifive_x280_intr_complex.c       | 71 +
 .../bli_setv_sifive_x280_intr_real.c          | 64 +
 .../sifive_x280/1/bli_swapv_sifive_x280_asm.c | 245 --
 .../bli_swapv_sifive_x280_intr.c              | 115 +
 .../bli_swapv_sifive_x280_intr_complex.c      | 76 +
 .../bli_swapv_sifive_x280_intr_real.c         | 76 +
 .../bli_xpbyv_sifive_x280_intr.c              | 2 +-
 .../1f/bli_axpyf_sifive_x280_asm.c            | 430 ---
 .../bli_axpyf_sifive_x280_intr.c              | 121 +
 .../bli_axpyf_sifive_x280_intr_complex.c      | 149 +
 .../bli_axpyf_sifive_x280_intr_real.c         | 96 +
 .../1f/bli_dotxaxpyf_sifive_x280_asm.c        | 3120 -----------------
 .../bli_dotxaxpyf_sifive_x280_intr.c          | 137 +
 .../bli_dotxaxpyf_sifive_x280_intr_complex.c  | 427 +++
 .../bli_dotxaxpyf_sifive_x280_intr_real.c     | 283 ++
 .../1f/bli_dotxf_sifive_x280_asm.c            | 2645 --------------
 .../bli_dotxf_sifive_x280_intr.c              | 132 +
 .../bli_dotxf_sifive_x280_intr_complex.c      | 324 ++
 .../bli_dotxf_sifive_x280_intr_real.c         | 262 ++
 .../1m/bli_packm_sifive_x280_asm.c            | 1465 --------
 .../bli_packm_sifive_x280_intr.c              | 168 +
 .../bli_packm_sifive_x280_intr_complex.c      | 545 +++
 .../bli_packm_sifive_x280_intr_real.c         | 364 ++
 .../sifive_x280/3/bli_gemm_sifive_x280_asm.c  | 2406 -------------
 .../bli_gemm_sifive_x280_intr.c               | 138 +
 .../bli_gemm_sifive_x280_intr_complex.c       | 517 +++
 .../bli_gemm_sifive_x280_intr_real.c          | 339 ++
 .../bli_gemmtrsm_l_sifive_x280_asm_complex.c  | 327 --
 .../bli_gemmtrsm_l_sifive_x280_asm_real.c     | 253 --
 .../bli_gemmtrsm_u_sifive_x280_asm_complex.c  | 331 --
 .../bli_gemmtrsm_u_sifive_x280_asm_real.c     | 260 --
 .../bli_gemmtrsm_sifive_x280_intr.c}          | 97 +-
 .../bli_gemmtrsm_sifive_x280_intr_complex.c   | 437 +++
 .../bli_gemmtrsm_sifive_x280_intr_real.c      | 364 ++
 kernels/sifive_x280/bli_kernels_sifive_x280.h | 106 +-
 kernels/sifive_x280/riscv_cmul_macros_intr.h  | 147 +
 .../sifive_x280/riscv_overloaded_intrinsics.h | 109 +-
 travis/do_riscv.sh                            | 2 +-
 62 files changed, 6902 insertions(+), 12931 deletions(-)
 delete mode 100644 
kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c create mode 100644 
kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c delete mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c rename kernels/sifive_x280/3/{bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c => bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c} (75%) create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c create mode 100644 kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c create mode 100644 kernels/sifive_x280/riscv_cmul_macros_intr.h diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c index 56a1a66d53..668891cf3f 100644 --- a/config/sifive_x280/bli_cntx_init_sifive_x280.c +++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c @@ -54,10 +54,10 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr, BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr, - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_x280_asm, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_x280_asm, - BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_x280_asm, - BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_asm, + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_x280_intr, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_x280_intr, + BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_x280_intr, + BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_intr, BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_x280_intr, BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_x280_intr, @@ -69,10 +69,10 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr, BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr, - BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_x280_asm, - BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_x280_asm, - BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_asm, - BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_asm, + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_x280_intr, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_x280_intr, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_intr, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_intr, BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_x280_intr, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_x280_intr, @@ -84,15 +84,15 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr, BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr, - 
BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_x280_asm, - BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_x280_asm, - BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_asm, - BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_asm, + BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_x280_intr, + BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_x280_intr, + BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_intr, + BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_intr, - BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_x280_asm, - BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_x280_asm, - BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_asm, - BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_asm, + BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_x280_intr, + BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_x280_intr, + BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_intr, + BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_intr, BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_x280_intr, BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_x280_intr, @@ -104,20 +104,20 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_x280_intr, BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr, - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_x280_asm, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_x280_asm, - BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_x280_asm, - BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_x280_asm, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_x280_intr, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_x280_intr, + BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_x280_intr, + BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_x280_intr, BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_x280_intr, BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_x280_intr, BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr, BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr, - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_x280_asm, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_x280_asm, - BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_x280_asm, - BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_x280_asm, + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_x280_intr, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_x280_intr, + BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_x280_intr, + BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_x280_intr, BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_x280_intr, BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_x280_intr, @@ -130,46 +130,46 @@ void bli_cntx_init_sifive_x280( cntx_t* cntx ) BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr, BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr, - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_x280_asm, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_x280_asm, - BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_asm, - BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_asm, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_x280_intr, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_x280_intr, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_intr, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_intr, - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_x280_asm, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_x280_asm, - BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_asm, - BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_asm, 
+ BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_x280_intr, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_x280_intr, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_intr, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_intr, BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_x280_intr, BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_x280_intr, BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr, BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr, - BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_x280_asm, - BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_x280_asm, - BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_asm, - BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_asm, + BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_x280_intr, + BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_x280_intr, + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_intr, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_intr, // Level 1m - BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_x280_asm_7m4, - BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_asm_7m4, - BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6m2, - BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6m2, + BLIS_PACKM_KER, BLIS_FLOAT, bli_spackm_sifive_x280_intr, + BLIS_PACKM_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_intr, + BLIS_PACKM_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_intr, + BLIS_PACKM_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_intr, // Level 3 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_asm_7m4, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_x280_asm_7m4, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_x280_asm_6m2, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_x280_asm_6m2, - - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_x280_asm, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_x280_asm, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_asm, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_asm, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_x280_asm, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_x280_asm, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_asm, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_asm, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_intr, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_x280_intr, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_x280_intr, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_x280_intr, + + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_x280_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_x280_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_intr, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_x280_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_x280_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_intr, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_intr, BLIS_VA_END ); diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk index acdf5a3611..31b31e387a 100644 --- a/config/sifive_x280/make_defs.mk +++ b/config/sifive_x280/make_defs.mk @@ -61,7 +61,7 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -Ofast +COPTFLAGS := -O3 endif # Flags specific to optimized kernels. 
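For readers who have not used the RISC-V vector (RVV) C intrinsics, the sketch below shows the general shape of the assembly-to-intrinsics translation that the remaining hunks in this patch perform: a hand-rolled vsetvli/vle/vse inline-assembly loop becomes an ordinary C loop over vsetvl/load/store intrinsic calls, with SEW and LMUL encoded in the vector type name rather than in hand-allocated registers. This is an illustrative sketch only, not code from the patch: the function name is hypothetical, it uses the raw `__riscv_`-prefixed intrinsics from <riscv_vector.h> (exact spellings vary with the compiler's intrinsics version), whereas the kernels in this patch go through the overloaded macro layer in riscv_overloaded_intrinsics.h (VSETVL, VLE_V_F, VSSE_V_F, ...) so that one source file can be instantiated per precision.

    #include <riscv_vector.h>
    #include <stddef.h>

    // Illustrative sketch only: a strided single-precision copy in the style
    // of the copyv kernels, written with raw RVV intrinsics instead of the
    // overloaded macros used by the patch. "copyv_sketch" is not part of BLIS.
    static void copyv_sketch(size_t n, const float *restrict x, ptrdiff_t incx,
                             float *restrict y, ptrdiff_t incy)
    {
        size_t avl = n;                             // elements left to process
        while (avl) {
            size_t vl = __riscv_vsetvl_e32m8(avl);  // take up to VLMAX elements
            vfloat32m8_t xvec;
            if (incx == 1)
                xvec = __riscv_vle32_v_f32m8(x, vl);                 // unit stride
            else
                xvec = __riscv_vlse32_v_f32m8(x, (ptrdiff_t)sizeof(float) * incx, vl);
            if (incy == 1)
                __riscv_vse32_v_f32m8(y, xvec, vl);
            else
                __riscv_vsse32_v_f32m8(y, (ptrdiff_t)sizeof(float) * incy, xvec, vl);
            x += (ptrdiff_t)vl * incx;
            y += (ptrdiff_t)vl * incy;
            avl -= vl;
        }
    }

The intrinsic kernels in the hunks that follow (amaxv, copyv, invertv, ...) keep this same avl/vl loop structure, adding masked comparisons and index reductions for amaxv and segment loads/stores (vlseg2/vlsseg2) for the complex variants.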
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c deleted file mode 100644 index c423dd131d..0000000000 --- a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, - dim_t *index, const cntx_t *cntx) { - // assumes 64-bit index - (void)cntx; - const float* restrict x = x_; - - if (n <= 1) { - *index = 0; - return; - } - incx *= 4; - size_t avl = n; - size_t offset = 0; - bool first = true; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma" - : "=r"(vl) - : "r"(avl)); - if (incx == 4) - __asm__("vle32.v v24, (%0)" : : "r"(x)); - else - __asm__("vlse32.v v24, (%0), %1" : : "r"(x), "r"(incx)); - // check for NaN - __asm__ volatile("vmfne.vv v0, v24, v24"); - dim_t nan_index; - __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); - if (nan_index != -1) { - *index = nan_index + offset; - return; - } - if (first) { - __asm__("vfabs.v v8, v24"); - // keep vl same, change SEW and LMUL - __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); - __asm__("vid.v v16"); - first = false; - } else { - __asm__("vfabs.v v24, v24"); - __asm__("vmflt.vv v0, v8, v24"); - __asm__("vmerge.vvm v8, v8, v24, v0"); - // keep vl same, change SEW and LMUL - __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma"); - __asm__("vid.v v24"); - __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); - __asm__("vmerge.vvm v16, v16, v24, v0"); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - offset += vl; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n)); - __asm__("vmv.s.x v0, zero"); - __asm__("vfredmax.vs v0, v8, v0"); - __asm__("vrgather.vi v24, v0, 0"); - __asm__("vmfeq.vv v0, v8, v24"); - __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); - uint64_t imax = -1; - __asm__("vmv.s.x v24, %0" : : "r"(imax)); - __asm__("vredminu.vs v24, v16, v24, v0.t"); - __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); - __asm__("vse64.v v24, (%0)" : : "r"(index)); - return; -} - -void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, - dim_t *index, const cntx_t *cntx) { - // assumes 64-bit index - (void)cntx; - const double* restrict x = x_; - - if (n <= 1) { - *index = 0; - return; - } - incx *= 8; - size_t avl = n; - size_t offset = 0; - bool first = true; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e64, m8, tu, ma" - : "=r"(vl) - : "r"(avl)); - if (incx == 8) - __asm__("vle64.v v24, (%0)" : : "r"(x)); - else - __asm__("vlse64.v v24, (%0), %1" : : "r"(x), "r"(incx)); - // check for NaN - __asm__ volatile("vmfne.vv v0, v24, v24"); - dim_t nan_index; - __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); - if (nan_index != -1) { - *index = nan_index + offset; - return; - } - if (first) { - __asm__("vfabs.v v8, v24"); - __asm__("vid.v v16"); - first = false; - } else { - __asm__("vfabs.v v24, v24"); - __asm__("vmflt.vv v0, v8, v24"); - __asm__("vmerge.vvm v8, v8, v24, v0"); - __asm__("vid.v v24"); - __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); - __asm__("vmerge.vvm v16, v16, v24, v0"); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - offset += vl; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e64, m8, ta, ma" : : "r"(n)); - __asm__("vmv.s.x v0, zero"); - __asm__("vfredmax.vs v0, v8, v0"); - __asm__("vrgather.vi v24, v0, 0"); - __asm__("vmfeq.vv v0, v8, v24"); - uint64_t imax = -1; - __asm__("vmv.s.x v24, %0" : : "r"(imax)); - __asm__("vredminu.vs v24, v16, v24, v0.t"); - __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); - __asm__("vse64.v v24, (%0)" : : "r"(index)); - return; -} - -void 
bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, - dim_t *index, const cntx_t *cntx) { - // assumes 64-bit index - (void)cntx; - const scomplex* restrict x = x_; - - if (n <= 1) { - *index = 0; - return; - } - incx *= 8; - size_t avl = n; - size_t offset = 0; - bool first = true; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma" - : "=r"(vl) - : "r"(avl)); - if (incx == 8) - __asm__("vlseg2e32.v v24, (%0)" : : "r"(x)); - else - __asm__("vlsseg2e32.v v24, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfabs.v v24, v24"); - __asm__("vfabs.v v28, v28"); - __asm__("vfadd.vv v24, v24, v28"); - // check for NaN - __asm__ volatile("vmfne.vv v0, v24, v24"); - dim_t nan_index; - __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); - if (nan_index != -1) { - *index = nan_index + offset; - return; - } - if (first) { - __asm__("vmv4r.v v8, v24"); - // keep vl same, change SEW and LMUL - __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); - __asm__("vid.v v16"); - first = false; - } else { - __asm__("vmflt.vv v0, v8, v24"); - __asm__("vmerge.vvm v8, v8, v24, v0"); - // keep vl same, change SEW and LMUL - __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma"); - __asm__("vid.v v24"); - __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); - __asm__("vmerge.vvm v16, v16, v24, v0"); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - offset += vl; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n)); - __asm__("vmv.s.x v0, zero"); - __asm__("vfredmax.vs v0, v8, v0"); - __asm__("vrgather.vi v24, v0, 0"); - __asm__("vmfeq.vv v0, v8, v24"); - __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); - uint64_t imax = -1; - __asm__("vmv.s.x v24, %0" : : "r"(imax)); - __asm__("vredminu.vs v24, v16, v24, v0.t"); - __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); - __asm__("vse64.v v24, (%0)" : : "r"(index)); - return; -} - -void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, - dim_t *index, const cntx_t *cntx) { - // assumes 64-bit index - (void)cntx; - const dcomplex* restrict x = x_; - - if (n <= 1) { - *index = 0; - return; - } - incx *= 16; - size_t avl = n; - size_t offset = 0; - bool first = true; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e64, m4, tu, ma" - : "=r"(vl) - : "r"(avl)); - if (incx == 16) - __asm__("vlseg2e64.v v24, (%0)" : : "r"(x)); - else - __asm__("vlsseg2e64.v v24, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfabs.v v24, v24"); - __asm__("vfabs.v v28, v28"); - __asm__("vfadd.vv v24, v24, v28"); - // check for NaN - __asm__ volatile("vmfne.vv v0, v24, v24"); - dim_t nan_index; - __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); - if (nan_index != -1) { - *index = nan_index + offset; - return; - } - if (first) { - __asm__("vmv4r.v v8, v24"); - __asm__("vid.v v16"); - first = false; - } else { - __asm__("vmflt.vv v0, v8, v24"); - __asm__("vmerge.vvm v8, v8, v24, v0"); - __asm__("vid.v v24"); - __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); - __asm__("vmerge.vvm v16, v16, v24, v0"); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - offset += vl; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(n)); - __asm__("vmv.s.x v0, zero"); - __asm__("vfredmax.vs v0, v8, v0"); - __asm__("vrgather.vi v24, v0, 0"); - __asm__("vmfeq.vv v0, v8, v24"); - uint64_t imax = -1; - __asm__("vmv.s.x v24, %0" : : "r"(imax)); - __asm__("vredminu.vs v24, v16, v24, v0.t"); - __asm__ volatile("vsetivli zero, 1, e64, m1, 
ta, ma"); - __asm__("vse64.v v24, (%0)" : : "r"(index)); - return; -} diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c new file mode 100644 index 0000000000..4f7d546304 --- /dev/null +++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr.c @@ -0,0 +1,179 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include +#include +#include + +#define AMAXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##amaxv_sifive_x280_intr(\ + dim_t n, \ + const T* restrict x_, inc_t incx, \ + dim_t* index, \ + const cntx_t* cntx \ +) + +#define AMAXV(...) AMAXV_(__VA_ARGS__) + +// BLIS defines integers to be 32 or 64 bits according to BLIS_INT_TYPE_SIZE. +// If BLIS_INT_TYPE_SIZE is any other value, integers are defined to be longs. +#if BLIS_INT_TYPE_SIZE == 32 || BLIS_INT_TYPE_SIZE == 64 +#define AMAXV_SIFIVE_X280_INT_SIZE BLIS_INT_TYPE_SIZE +#elif LONG_MAX == INT32_MAX +#define AMAXV_SIFIVE_X280_INT_SIZE 32 +#elif LONG_MAX == INT64_MAX +#define AMAXV_SIFIVE_X280_INT_SIZE 64 +#else +#error "Integers must be 32- or 64-bits for bli_?amaxv_sifive_x280_intr." 
+#endif + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC_X 32 +#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#if PREC_I == 32 +#define LMUL_X m4 +#define LMUL_I m4 +#define RATIO 8 +#elif PREC_I == 64 +#define LMUL_X m4 +#define LMUL_I m8 +#define RATIO 8 +#endif +#define FLT_SIZE sizeof(float) + +#include "./bli_amaxv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC_X +#undef PREC_I +#undef LMUL_X +#undef LMUL_I +#undef RATIO +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC_X 64 +#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#if PREC_I == 32 +#define LMUL_X m8 +#define LMUL_I m4 +#define RATIO 8 +#elif PREC_I == 64 +#define LMUL_X m4 +#define LMUL_I m4 +#define RATIO 16 +#endif +#define FLT_SIZE sizeof(double) + +#include "./bli_amaxv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC_X +#undef PREC_I +#undef LMUL_X +#undef LMUL_I +#undef RATIO +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC_X 32 +#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#if PREC_I == 32 +#define LMUL_X m4 +#define LMUL_I m4 +#define RATIO 8 +#elif PREC_I == 64 +#define LMUL_X m4 +#define LMUL_I m8 +#define RATIO 8 +#endif +#define FLT_SIZE sizeof(float) + +#include "./bli_amaxv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC_X +#undef PREC_I +#undef LMUL_X +#undef LMUL_I +#undef RATIO +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC_X 64 +#define PREC_I AMAXV_SIFIVE_X280_INT_SIZE +#if PREC_I == 32 +#define LMUL_X m8 +#define LMUL_I m4 +#define RATIO 8 +#elif PREC_I == 64 +#define LMUL_X m4 +#define LMUL_I m4 +#define RATIO 16 +#endif +#define FLT_SIZE sizeof(double) + +#include "./bli_amaxv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC_X +#undef PREC_I +#undef LMUL_X +#undef LMUL_I +#undef RATIO +#undef FLT_SIZE + +#undef AMAXV_SIFIVE_X280_INT_SIZE + +#undef AMAXV +#undef AMAXV_ diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..f1f3a749e7 --- /dev/null +++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_complex.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AMAXV + +AMAXV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict x = x_; + + if (n <= 1) { + *index = 0; + return; + } + + RVV_TYPE_F(PREC_X, LMUL_X) xacc; + // Indices will be unsigned and of the same width as dim_t. + RVV_TYPE_U(PREC_I, LMUL_I) iacc; + RVV_TYPE_U(PREC_I, LMUL_I) vid_vec = VID_V(PREC_I, LMUL_I)(n); + bool first = true; + guint_t offset = 0; + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC_X, LMUL_X)(avl); + RVV_TYPE_FX(PREC_X, LMUL_X, 2) xvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC_X, LMUL_X, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC_X, LMUL_X, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + + RVV_TYPE_F(PREC_X, LMUL_X) xvec_real = VGET_V_F(PREC_X, LMUL_X, 2)(xvec, 0); + RVV_TYPE_F(PREC_X, LMUL_X) xvec_imag = VGET_V_F(PREC_X, LMUL_X, 2)(xvec, 1); + RVV_TYPE_F(PREC_X, LMUL_X) xvec_real_abs = VFABS_V(PREC_X, LMUL_X)(xvec_real, vl); + RVV_TYPE_F(PREC_X, LMUL_X) xvec_imag_abs = VFABS_V(PREC_X, LMUL_X)(xvec_imag, vl); + RVV_TYPE_F(PREC_X, LMUL_X) xvec_abs = VFADD_VV(PREC_X, LMUL_X)(xvec_real_abs, xvec_imag_abs, vl); + + RVV_TYPE_B(RATIO) is_nan = VMFNE_VV(PREC_X, LMUL_X, RATIO)(xvec_abs, xvec_abs, vl); + int nan_index = VFIRST_M(RATIO)(is_nan, vl); + if (nan_index != -1) { + *index = (guint_t) nan_index + offset; + return; + } + + if (first) { + xacc = xvec_abs; + iacc = vid_vec; + first = false; + } + else { + RVV_TYPE_B(RATIO) mask = VMFGT_VV(PREC_X, LMUL_X, RATIO)(xvec_abs, xacc, vl); + xacc = VFMAX_VV_TU(PREC_X, LMUL_X)(xacc, xvec_abs, xacc, vl); + RVV_TYPE_U(PREC_I, LMUL_I) ivec = VADD_VX_U(PREC_I, LMUL_I)(vid_vec, offset, vl); + iacc = VMERGE_VVM_TU_U(PREC_I, LMUL_I)(iacc, iacc, ivec, mask, vl); + } + + x += vl * incx; + offset += vl; + avl -= vl; + } + + RVV_TYPE_F(PREC_X, m1) xmax = VFMV_S_F(PREC_X, m1)(0., 1); + xmax = VFREDMAX_VS(PREC_X, LMUL_X)(xacc, xmax, n); + RVV_TYPE_F(PREC_X, LMUL_X) xmax_splat = VLMUL_EXT_V_F_M1(PREC_X, LMUL_X)(xmax); + xmax_splat = VRGATHER_VX_F(PREC_X, LMUL_X)(xmax_splat, 0, n); + RVV_TYPE_B(RATIO) mask = VMFEQ_VV(PREC_X, LMUL_X, RATIO)(xacc, xmax_splat, n); + RVV_TYPE_U(PREC_I, m1) imax = VMV_S_X_U(PREC_I, m1)(-1, 1); + imax = VREDMINU_VS_M(PREC_I, LMUL_I)(mask, iacc, imax, n); + *index = VMV_X_S_U(PREC_I)(imax); + return; +} + +#endif // AMAXV diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..bcc4ee99de --- /dev/null +++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_intr/bli_amaxv_sifive_x280_intr_real.c @@ -0,0 +1,100 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AMAXV + +AMAXV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict x = x_; + + if (n <= 1) { + *index = 0; + return; + } + + RVV_TYPE_F(PREC_X, LMUL_X) xacc; + // Indices will be unsigned and of the same width as dim_t. 
+ RVV_TYPE_U(PREC_I, LMUL_I) iacc; + RVV_TYPE_U(PREC_I, LMUL_I) vid_vec = VID_V(PREC_I, LMUL_I)(n); + bool first = true; + guint_t offset = 0; + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC_X, LMUL_X)(avl); + RVV_TYPE_F(PREC_X, LMUL_X) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC_X, LMUL_X)(x, vl); + else + xvec = VLSE_V_F(PREC_X, LMUL_X)(x, FLT_SIZE * incx, vl); + + RVV_TYPE_B(RATIO) is_nan = VMFNE_VV(PREC_X, LMUL_X, RATIO)(xvec, xvec, vl); + int nan_index = VFIRST_M(RATIO)(is_nan, vl); + if (nan_index != -1) { + *index = (guint_t) nan_index + offset; + return; + } + + if (first) { + xacc = VFABS_V(PREC_X, LMUL_X)(xvec, vl); + iacc = vid_vec; + first = false; + } + else { + xvec = VFABS_V(PREC_X, LMUL_X)(xvec, vl); + RVV_TYPE_B(RATIO) mask = VMFGT_VV(PREC_X, LMUL_X, RATIO)(xvec, xacc, vl); + xacc = VFMAX_VV_TU(PREC_X, LMUL_X)(xacc, xvec, xacc, vl); + RVV_TYPE_U(PREC_I, LMUL_I) ivec = VADD_VX_U(PREC_I, LMUL_I)(vid_vec, offset, vl); + iacc = VMERGE_VVM_TU_U(PREC_I, LMUL_I)(iacc, iacc, ivec, mask, vl); + } + + x += vl * incx; + offset += vl; + avl -= vl; + } + + RVV_TYPE_F(PREC_X, m1) xmax = VFMV_S_F(PREC_X, m1)(0., 1); + xmax = VFREDMAX_VS(PREC_X, LMUL_X)(xacc, xmax, n); + RVV_TYPE_F(PREC_X, LMUL_X) xmax_splat = VLMUL_EXT_V_F_M1(PREC_X, LMUL_X)(xmax); + xmax_splat = VRGATHER_VX_F(PREC_X, LMUL_X)(xmax_splat, 0, n); + RVV_TYPE_B(RATIO) mask = VMFEQ_VV(PREC_X, LMUL_X, RATIO)(xacc, xmax_splat, n); + RVV_TYPE_U(PREC_I, m1) imax = VMV_S_X_U(PREC_I, m1)(-1, 1); + imax = VREDMINU_VS_M(PREC_I, LMUL_I)(mask, iacc, imax, n); + *index = VMV_X_S_U(PREC_I)(imax); + return; +} + +#endif // AMAXV diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c index 3b29f898df..389292f90f 100644 --- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c +++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c @@ -52,9 +52,7 @@ #define AXPBYV(...) AXPBYV_(__VA_ARGS__) -#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm -#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) #define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr #define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c deleted file mode 100644 index 3571877759..0000000000 --- a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)conjx; - (void)cntx; - const float* restrict x = x_; - float* restrict y = y_; - if (n <= 0) - return; - - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - - if (incy == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)conjx; - (void)cntx; - const double* restrict x = x_; - double* restrict y = y_; - if (n <= 0) - return; - - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - - if (incy == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " - -void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - const scomplex* 
restrict x = x_; - scomplex* restrict y = y_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - if (conjx == BLIS_NO_CONJUGATE) { - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * 2 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - - if (incy == 2 * FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - } else { - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - - __asm__("vfneg.v v4, v4"); - - if (incy == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 - -#define FLT_SIZE 8 -#define SH_ADD "sh3add " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - const dcomplex* restrict x = x_; - dcomplex* restrict y = y_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - if (conjx == BLIS_NO_CONJUGATE && incx == 2 * FLT_SIZE && - incy == 2 * FLT_SIZE) { - size_t avl = 2 * n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__(VSE "v0, (%0)" : : "r"(y)); - __asm__(SH_ADD "%0, %1, %0" : "+r"(x) : "r"(vl)); - __asm__(SH_ADD "%0, %1, %0" : "+r"(y) : "r"(vl)); - avl -= vl; - } - } else { - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - - if (conjx == BLIS_CONJUGATE) - __asm__("vfneg.v v4, v4"); - - if (incy == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - } - return; -} diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c new file mode 100644 index 0000000000..e030d85ff3 --- /dev/null +++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr.c @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define COPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##copyv_sifive_x280_intr(\ + conj_t conjx, \ + dim_t n, \ + const T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define COPYV(...) COPYV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_copyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_copyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_copyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_copyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef COPYV +#undef COPYV_ diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..21e5959679 --- /dev/null +++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_complex.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef COPYV + +COPYV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + + if (bli_is_conj(conjx)) { + RVV_TYPE_F(PREC, LMUL) xvec_imag; + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + xvec_imag = VFNEG_VF(PREC, LMUL)(xvec_imag, vl); + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag); + } + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, xvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // COPYV diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..00bb8ed494 --- /dev/null +++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_intr/bli_copyv_sifive_x280_intr_real.c @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef COPYV + +COPYV(PRECISION_CHAR, void) +{ + (void)conjx; + (void)cntx; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + if (incy == 1) + VSE_V_F(PREC, LMUL)(y, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // COPYV diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c deleted file mode 100644 index cbca885929..0000000000 --- a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - float* restrict x = x_; - if (n <= 0) - return; - - float one = 1.f; - __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); - incx *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__("vfrdiv.vf v0, v0, f0"); - __asm__(VSE "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfrdiv.vf v0, v0, f0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - double* restrict x = x_; - if (n <= 0) - return; - - double one = 1.; - __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); - incx *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__("vfrdiv.vf v0, v0, f0"); - __asm__(VSE "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfrdiv.vf v0, v0, f0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " - -void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - scomplex* restrict x = x_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - __asm__("vfneg.v v4, v4"); - __asm__("vfmul.vv v8, v0, v0"); - __asm__("vfmacc.vv v8, v4, v4"); - __asm__("vfdiv.vv v0, v0, v8"); - __asm__("vfdiv.vv v4, v4, v8"); - __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfneg.v v4, v4"); - __asm__("vfmul.vv v8, v0, v0"); - __asm__("vfmacc.vv v8, v4, v4"); - __asm__("vfdiv.vv v0, v0, v8"); - __asm__("vfdiv.vv v4, v4, v8"); - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 - -#define FLT_SIZE 8 -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t 
incx, - const cntx_t *cntx) { - (void)cntx; - dcomplex* restrict x = x_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - __asm__("vfneg.v v4, v4"); - __asm__("vfmul.vv v8, v0, v0"); - __asm__("vfmacc.vv v8, v4, v4"); - __asm__("vfdiv.vv v0, v0, v8"); - __asm__("vfdiv.vv v4, v4, v8"); - __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfneg.v v4, v4"); - __asm__("vfmul.vv v8, v0, v0"); - __asm__("vfmacc.vv v8, v4, v4"); - __asm__("vfdiv.vv v0, v0, v8"); - __asm__("vfdiv.vv v4, v4, v8"); - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c new file mode 100644 index 0000000000..fc8f8a76d7 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr.c @@ -0,0 +1,118 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define INVERTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invertv_sifive_x280_intr(\ + dim_t n, \ + T* restrict x_, inc_t incx, \ + const cntx_t* cntx \ +) + +#define INVERTV(...) 
INVERTV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_invertv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_invertv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define RATIO 8 +#define FLT_SIZE sizeof(float) + +#include "./bli_invertv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef RATIO +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define RATIO 16 +#define FLT_SIZE sizeof(double) + +#include "./bli_invertv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef RATIO +#undef FLT_SIZE + +#undef INVERTV +#undef INVERTV_ diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..994ae3075c --- /dev/null +++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_complex.c @@ -0,0 +1,83 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
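Note on the wrapper layout above: bli_invertv_sifive_x280_intr.c, like the other new *_intr.c files in this patch, instantiates one shared kernel body per precision by defining DATATYPE, PRECISION_CHAR, PREC, LMUL, and FLT_SIZE, textually including the _real.c or _complex.c body, and undefining everything before the next precision. The two-level INVERTV/INVERTV_ macro exists so that PRECISION_CHAR is expanded before token pasting. Below is a minimal self-contained sketch of the same idiom; the names SCALE, sscale, and dscale are illustrative only and are not BLIS code, and the shared body is written inline here (rather than in a separate included file) purely to keep the sketch compilable as one translation unit.

/* instantiation_sketch.c -- illustrative only, not part of BLIS */
#include <stddef.h>
#include <stdio.h>

/* two-level macro: SCALE() expands its arguments first, SCALE_() pastes */
#define SCALE_(CH, T) void CH##scale(size_t n, T *x, T a)
#define SCALE(...) SCALE_(__VA_ARGS__)

/* single-precision instance (in the BLIS layout the body below would be
   pulled in via #include of a shared *_real.c file instead) */
#define DATATYPE float
#define PRECISION_CHAR s
SCALE(PRECISION_CHAR, DATATYPE) { for (size_t i = 0; i < n; ++i) x[i] *= a; }
#undef DATATYPE
#undef PRECISION_CHAR

/* double-precision instance of the same body */
#define DATATYPE double
#define PRECISION_CHAR d
SCALE(PRECISION_CHAR, DATATYPE) { for (size_t i = 0; i < n; ++i) x[i] *= a; }
#undef DATATYPE
#undef PRECISION_CHAR

int main(void)
{
    float  xf[3] = { 1.0f, 2.0f, 3.0f };
    double xd[3] = { 1.0,  2.0,  3.0 };
    sscale(3, xf, 2.0f);              /* each precision got its own function */
    dscale(3, xd, 0.5);
    printf("%g %g\n", xf[2], xd[2]);  /* prints 6 1.5 */
    return 0;
}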
+ +*/ + +// clang-format off +#ifdef INVERTV + +INVERTV(PRECISION_CHAR, void) +{ + (void)cntx; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + RVV_TYPE_F(PREC, LMUL) xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + RVV_TYPE_F(PREC, LMUL) xvec_real_abs = VFABS_V(PREC, LMUL)(xvec_real, vl); + RVV_TYPE_F(PREC, LMUL) xvec_imag_abs = VFABS_V(PREC, LMUL)(xvec_imag, vl); + RVV_TYPE_B(RATIO) mask = VMFGE_VV(PREC, LMUL, RATIO)(xvec_real_abs, xvec_imag_abs, vl); + RVV_TYPE_F(PREC, LMUL) max = VMERGE_VVM_F(PREC, LMUL)(xvec_imag, xvec_real, mask, vl); + RVV_TYPE_F(PREC, LMUL) min = VMERGE_VVM_F(PREC, LMUL)(xvec_real, xvec_imag, mask, vl); + RVV_TYPE_F(PREC, LMUL) f = VFDIV_VV(PREC, LMUL)(min, max, vl); + RVV_TYPE_F(PREC, LMUL) denom = VFMACC_VV(PREC, LMUL)(max, f, min, vl); + RVV_TYPE_F(PREC, LMUL) t1 = VFRDIV_VF(PREC, LMUL)(denom, 1., vl); + RVV_TYPE_F(PREC, LMUL) t2 = VFDIV_VV(PREC, LMUL)(f, denom, vl); + xvec_real = VMERGE_VVM_F(PREC, LMUL)(t2, t1, mask, vl); + xvec_imag = VMERGE_VVM_F(PREC, LMUL)(t1, t2, mask, vl); + xvec_imag = VFNEG_VF(PREC, LMUL)(xvec_imag, vl); + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, xvec_real); + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, xvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, xvec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // INVERTV diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..621e88c9f6 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_intr/bli_invertv_sifive_x280_intr_real.c @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef INVERTV + +INVERTV(PRECISION_CHAR, void) +{ + (void)cntx; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + xvec = VFRDIV_VF(PREC, LMUL)(xvec, 1., vl); + + if (incx == 1) + VSE_V_F(PREC, LMUL)(x, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, xvec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // INVERTV diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c deleted file mode 100644 index 51edc92214..0000000000 --- a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
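The complex invertv kernel above computes 1/(a+bi) with a scaled (Smith-style) formulation rather than dividing by a*a + b*b directly: a mask selects the larger of |a| and |b| per element, f = min/max and denom = max + f*min are formed, and the result is assembled from 1/denom and f/denom with the imaginary part negated, which avoids overflow and underflow from forming a*a + b*b. A scalar sketch of the same computation, for reference only (recip_smith is a made-up name, not a BLIS routine):

/* scalar sketch of the scaled complex reciprocal; illustrative only */
#include <complex.h>
#include <math.h>
#include <stdio.h>

static float complex recip_smith(float complex z)
{
    float a = crealf(z), b = cimagf(z);
    if (fabsf(a) >= fabsf(b)) {
        float f = b / a;           /* |f| <= 1 */
        float denom = a + f * b;   /* equals (a*a + b*b) / a without the squares */
        return (1.0f / denom) - (f / denom) * I;
    } else {
        float f = a / b;
        float denom = b + f * a;
        return (f / denom) - (1.0f / denom) * I;
    }
}

int main(void)
{
    float complex r = recip_smith(3.0f + 4.0f * I);  /* expect 0.12 - 0.16i */
    printf("%g %+gi\n", crealf(r), cimagf(r));
    return 0;
}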
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FDIV "fdiv.s " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)conjalpha; - (void)cntx; - const float* restrict alpha = alpha_; - float* restrict x = x_; - if (n <= 0 || *alpha == 0.f || *alpha == 1.f) - return; - - float one = 1.f; - __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); - __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha)); - __asm__(FDIV "f0, f0, f1"); - incx *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__(VSE "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FDIV -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FDIV "fdiv.d " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)conjalpha; - (void)cntx; - const double* restrict alpha = alpha_; - double* restrict x = x_; - if (n <= 0 || *alpha == 0. || *alpha == 1.) 
- return; - - double one = 1.; - __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); - __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha)); - __asm__(FDIV "f0, f0, f1"); - incx *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(x)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__(VSE "v0, (%0)" : : "r"(x)); - } else { - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FDIV -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define FMADD "fmadd.s " -#define FDIV "fdiv.s " -#define FNEG "fneg.s " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " - -void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - const scomplex* restrict alpha = alpha_; - scomplex* restrict x = x_; - if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f)) - return; - - __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FMUL "f2, f0, f0"); - __asm__(FMADD "f2, f1, f1, f2"); - __asm__(FDIV "f0, f0, f2"); - __asm__(FDIV "f1, f1, f2"); - if (conjalpha == BLIS_NO_CONJUGATE) - __asm__(FNEG "f1, f1"); - incx *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - __asm__("vfmul.vf v8, v0, f0"); - __asm__("vfmul.vf v12, v4, f0"); - __asm__("vfnmsac.vf v8, f1, v4"); - __asm__("vfmacc.vf v12, f1, v0"); - __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); - } else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfmul.vf v8, v0, f0"); - __asm__("vfmul.vf v12, v4, f0"); - __asm__("vfnmsac.vf v8, f1, v4"); - __asm__("vfmacc.vf v12, f1, v0"); - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef FMADD -#undef FDIV -#undef FNEG -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define FMADD "fmadd.d " -#define FDIV "fdiv.d " -#define FNEG "fneg.d " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, - const cntx_t *cntx) { - (void)cntx; - const dcomplex* restrict alpha = alpha_; - dcomplex* restrict x = x_; - if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. 
&& alpha->imag == 0.)) - return; - - __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FMUL "f2, f0, f0"); - __asm__(FMADD "f2, f1, f1, f2"); - __asm__(FDIV "f0, f0, f2"); - __asm__(FDIV "f1, f1, f2"); - if (conjalpha == BLIS_NO_CONJUGATE) - __asm__(FNEG "f1, f1"); - incx *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - __asm__("vfmul.vf v8, v0, f0"); - __asm__("vfmul.vf v12, v4, f0"); - __asm__("vfnmsac.vf v8, f1, v4"); - __asm__("vfmacc.vf v12, f1, v0"); - __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); - } else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("vfmul.vf v8, v0, f0"); - __asm__("vfmul.vf v12, v4, f0"); - __asm__("vfnmsac.vf v8, f1, v4"); - __asm__("vfmacc.vf v12, f1, v0"); - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); - } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c new file mode 100644 index 0000000000..a5c7561bd8 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr.c @@ -0,0 +1,117 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define INVSCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##invscalv_sifive_x280_intr(\ + conj_t conjalpha, \ + dim_t n, \ + const T* restrict alpha_, \ + T* restrict x_, inc_t incx, \ + const cntx_t* cntx \ +) + +#define INVSCALV(...) 
INVSCALV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_invscalv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_invscalv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_invscalv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_invscalv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef INVSCALV +#undef INVSCALV_ diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..077e9dd061 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_complex.c @@ -0,0 +1,83 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef INVSCALV + +INVSCALV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + if (PASTEMAC(PRECISION_CHAR, eq1)(*alpha)) return; + if (PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) return; + + DATATYPE alpha_conj_inv; + PASTEMAC(PRECISION_CHAR, copycjs)(conjalpha, *alpha, alpha_conj_inv); + PASTEMAC(PRECISION_CHAR, inverts)(alpha_conj_inv); + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + RVV_TYPE_F(PREC, LMUL) xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + RVV_TYPE_F(PREC, LMUL) yvec_real, yvec_imag; + + VCMUL_VF(PREC, LMUL, yvec_real, yvec_imag, xvec_real, xvec_imag, alpha_conj_inv.real, alpha_conj_inv.imag, vl); + + RVV_TYPE_FX(PREC, LMUL, 2) yvec = VUNDEFINED_FX(PREC, LMUL, 2)(); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, yvec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // INVSCALV diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..a38b97c335 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_intr/bli_invscalv_sifive_x280_intr_real.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
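All of the new intrinsic kernels, including the complex invscalv kernel above, share the same strip-mining structure: vsetvl reports how many elements the hardware will handle this pass, the body loads, computes, and stores exactly vl elements, and the pointer and the remaining application vector length (avl) advance by vl. A minimal sketch of that loop with the riscv_vector.h intrinsics follows; scale_f32 is an illustrative name, not BLIS code, and the __riscv_-prefixed spellings assume a toolchain with the current intrinsics naming.

/* strip-mining sketch; illustrative only */
#include <riscv_vector.h>
#include <stddef.h>

void scale_f32(size_t n, float *x, float alpha)
{
    size_t avl = n;                               /* elements still to process */
    while (avl) {
        size_t vl = __riscv_vsetvl_e32m8(avl);    /* elements this pass        */
        vfloat32m8_t v = __riscv_vle32_v_f32m8(x, vl);   /* unit-stride load   */
        v = __riscv_vfmul_vf_f32m8(v, alpha, vl);        /* x[i] *= alpha      */
        __riscv_vse32_v_f32m8(x, v, vl);                 /* store back         */
        x   += vl;
        avl -= vl;
    }
}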
+ +*/ + +// clang-format off +#ifdef INVSCALV + +INVSCALV(PRECISION_CHAR, void) +{ + (void)conjalpha; + (void)cntx; + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + if (PASTEMAC(PRECISION_CHAR, eq1)(*alpha)) return; + if (PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) return; + + DATATYPE alpha_inv = *alpha; + PASTEMAC(PRECISION_CHAR, inverts)(alpha_inv); + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + xvec = VFMUL_VF(PREC, LMUL)(xvec, alpha_inv, vl); + + if (incx == 1) + VSE_V_F(PREC, LMUL)(x, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, xvec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // INVSCALV diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c index cd2dd2c188..4cae8257c3 100644 --- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c +++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c @@ -51,9 +51,9 @@ #define SCAL2V(...) SCAL2V_(__VA_ARGS__) -#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) // Single precision real diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c index 4a25ce3e32..2e946a2a4c 100644 --- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c @@ -77,13 +77,9 @@ SCAL2V(PRECISION_CHAR, void) yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); } - // FIXME: remove the #pragmas and change the __riscv_vset_v_f intrinsics to use - // __riscv_vcreate_v_f once they become available in LLVM. - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wuninitialized" + yvec = VUNDEFINED_FX(PREC, LMUL, 2)(); yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); - #pragma GCC diagnostic pop if (incy == 1) VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl); diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c index b5788d632d..d1fb9940eb 100644 --- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c +++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c @@ -49,7 +49,7 @@ #define SCALV(...) 
SCALV_(__VA_ARGS__) -#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr #define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) // Single precision real diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c deleted file mode 100644 index ef9091f16c..0000000000 --- a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, const cntx_t *cntx) { - (void)conjalpha; - (void)cntx; - const float* restrict alpha = alpha_; - float* restrict x = x_; - if (n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma" - : - : "r"(n), "i"(8 * FLT_SIZE)); - __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); - incx *= FLT_SIZE; - - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(x)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, const cntx_t *cntx) { - (void)conjalpha; - (void)cntx; - const double* restrict alpha = alpha_; - double* restrict x = x_; - if (n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma" - : - : "r"(n), "i"(8 * FLT_SIZE)); - __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); - incx *= FLT_SIZE; - - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(x)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define VLSE "vlse32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " - -void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, const cntx_t *cntx) { - (void)cntx; - const scomplex* restrict alpha = alpha_; - scomplex* restrict x = x_; - if (n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" - : - : "r"(n), "i"(8 * FLT_SIZE)); - __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); - __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(VLSE "v4, (t0), zero"); - if (conjalpha == BLIS_CONJUGATE) - __asm__("vfneg.v v4, v4"); - incx *= 2 * FLT_SIZE; - - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLSE -#undef VSSEG2 -#undef VSSSEG2 - -#define FLT_SIZE 8 -#define VLSE "vlse64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, - void * restrict x_, inc_t incx, const cntx_t *cntx) { - (void)cntx; - const dcomplex* restrict alpha = alpha_; - dcomplex* restrict x = x_; - if (n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" - : - : "r"(n), "i"(8 * FLT_SIZE)); - 
__asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); - __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(VLSE "v4, (t0), zero"); - if (conjalpha == BLIS_CONJUGATE) - __asm__("vfneg.v v4, v4"); - incx *= 2 * FLT_SIZE; - - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c new file mode 100644 index 0000000000..8c2ba7c72a --- /dev/null +++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr.c @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define SETV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##setv_sifive_x280_intr(\ + conj_t conjalpha, \ + dim_t n, \ + const T* restrict alpha_, \ + T* restrict x_, inc_t incx, \ + const cntx_t* cntx \ +) + +#define SETV(...) 
SETV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_setv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_setv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_setv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_setv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SETV +#undef SETV_ diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..efee3a7f60 --- /dev/null +++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_complex.c @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef SETV + +SETV(PRECISION_CHAR, void) +{ + (void)cntx; + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + + DATATYPE alpha_conj; + PASTEMAC(PRECISION_CHAR, copycjs)(conjalpha, *alpha, alpha_conj); + + RVV_TYPE_F(PREC, LMUL) alpha_conj_real_vec = VFMV_V_F(PREC, LMUL)(alpha_conj.real, n); + RVV_TYPE_F(PREC, LMUL) alpha_conj_imag_vec = VFMV_V_F(PREC, LMUL)(alpha_conj.imag, n); + + RVV_TYPE_FX(PREC, LMUL, 2) alpha_conj_vec = VUNDEFINED_FX(PREC, LMUL, 2)(); + alpha_conj_vec = VSET_V_F(PREC, LMUL, 2)(alpha_conj_vec, 0, alpha_conj_real_vec); + alpha_conj_vec = VSET_V_F(PREC, LMUL, 2)(alpha_conj_vec, 1, alpha_conj_imag_vec); + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, alpha_conj_vec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, alpha_conj_vec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // SETV diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..4b73de5c4c --- /dev/null +++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_intr/bli_setv_sifive_x280_intr_real.c @@ -0,0 +1,64 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
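The complex kernels in this patch (setv above, and likewise copyv, invertv, invscalv, swapv) keep real and imaginary parts in separate vector registers by using segment-2 memory operations: vlseg2e de-interleaves (re,im) pairs into a two-register tuple, vget/vset split the tuple apart and reassemble it, and vsseg2e re-interleaves on the way out, which is roughly what the VLSEG2_V_F/VGET_V_F/VSSEG2_V_F wrappers expand to. Below is a sketch of that idiom that negates the imaginary half, as the conjugating paths do; conj_inplace_c is an illustrative name, and the tuple-type intrinsic spellings assume a toolchain that provides the riscv_vector.h segment intrinsics.

/* segment load/store sketch; illustrative only */
#include <riscv_vector.h>
#include <stddef.h>

/* conjugate n interleaved single-precision complex numbers stored as
   re,im,re,im,... contiguously in x */
void conj_inplace_c(size_t n, float *x)
{
    size_t avl = n;
    while (avl) {
        size_t vl = __riscv_vsetvl_e32m4(avl);
        /* de-interleave vl (re,im) pairs into a 2-register tuple */
        vfloat32m4x2_t t  = __riscv_vlseg2e32_v_f32m4x2(x, vl);
        vfloat32m4_t   im = __riscv_vget_v_f32m4x2_f32m4(t, 1);
        im = __riscv_vfneg_v_f32m4(im, vl);       /* negate imaginary half */
        t  = __riscv_vset_v_f32m4_f32m4x2(t, 1, im);
        /* re-interleave and store */
        __riscv_vsseg2e32_v_f32m4x2(x, t, vl);
        x   += 2 * vl;                            /* 2 floats per element  */
        avl -= vl;
    }
}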
+ +*/ + +// clang-format off +#ifdef SETV + +SETV(PRECISION_CHAR, void) +{ + (void)conjalpha; + (void)cntx; + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0) return; + + RVV_TYPE_F(PREC, LMUL) alpha_vec = VFMV_V_F(PREC, LMUL)(*alpha, n); + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + + if (incx == 1) + VSE_V_F(PREC, LMUL)(x, alpha_vec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, alpha_vec, vl); + + x += vl * incx; + avl -= vl; + } + return; +} + +#endif // SETV diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c deleted file mode 100644 index 2342e254a2..0000000000 --- a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
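Every kernel in this patch dispatches between the unit-stride and strided load/store forms on incx == 1 (or incy == 1), with the strided form taking a byte stride of the element size times the increment; the real setv kernel above is the simplest instance since it only stores a broadcast value. A sketch of that dispatch follows, using the riscv_vector.h intrinsics; set_f32 is an illustrative name, not the BLIS kernel.

/* unit-stride vs. strided dispatch sketch; illustrative only */
#include <riscv_vector.h>
#include <stddef.h>

void set_f32(size_t n, float alpha, float *x, ptrdiff_t incx)
{
    /* broadcast alpha once, outside the loop */
    vfloat32m8_t av = __riscv_vfmv_v_f_f32m8(alpha, __riscv_vsetvlmax_e32m8());

    size_t avl = n;
    while (avl) {
        size_t vl = __riscv_vsetvl_e32m8(avl);
        if (incx == 1)
            __riscv_vse32_v_f32m8(x, av, vl);                 /* contiguous       */
        else
            __riscv_vsse32_v_f32m8(x, (ptrdiff_t)sizeof(float) * incx, av, vl);
                                                              /* byte stride      */
        x   += (ptrdiff_t)vl * incx;                          /* advance by vl    */
        avl -= vl;
    }
}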
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void * restrict y_, - inc_t incy, const cntx_t *cntx) { - (void)cntx; - float* restrict x = x_; - float* restrict y = y_; - if (n <= 0) - return; - - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == FLT_SIZE) - __asm__(VLE "v8, (%0)" : : "r"(y)); - else - __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); - - if (incx == FLT_SIZE) - __asm__(VSE "v8, (%0)" : : "r"(x)); - else - __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - double* restrict x = x_; - double* restrict y = y_; - if (n <= 0) - return; - - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == FLT_SIZE) - __asm__(VLE "v8, (%0)" : : "r"(y)); - else - __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); - - if (incx == FLT_SIZE) - __asm__(VSE "v8, (%0)" : : "r"(x)); - else - __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - scomplex* restrict x = x_; - scomplex* restrict y = y_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * 2 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(x)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == 2 * FLT_SIZE) - __asm__(VLE "v8, (%0)" : : "r"(y)); - else - __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); - - if (incx == 2 * FLT_SIZE) - __asm__(VSE "v8, (%0)" : : "r"(x)); - else - __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == 2 * FLT_SIZE) - __asm__(VSE "v0, (%0)" : : "r"(y)); - 
else - __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " - -void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, - void * restrict y_, inc_t incy, const cntx_t *cntx) { - (void)cntx; - dcomplex* restrict x = x_; - dcomplex* restrict y = y_; - if (n <= 0) - return; - - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); - else - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == 2 * FLT_SIZE) - __asm__(VLSEG2 "v8, (%0)" : : "r"(y)); - else - __asm__(VLSSEG2 "v8, (%0), %1" : : "r"(y), "r"(incy)); - - if (incx == 2 * FLT_SIZE) - __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); - else - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); - if (incy == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c new file mode 100644 index 0000000000..baf685d35f --- /dev/null +++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr.c @@ -0,0 +1,115 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define SWAPV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##swapv_sifive_x280_intr(\ + dim_t n, \ + T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define SWAPV(...) SWAPV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_swapv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_swapv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_swapv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_swapv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SWAPV +#undef SWAPV_ diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..104ba52235 --- /dev/null +++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_complex.c @@ -0,0 +1,76 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
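For clarity, each precision-specific kernel in this dispatch file is generated by defining DATATYPE, PRECISION_CHAR, PREC, and LMUL and then re-including the shared real or complex body, so the single-precision instantiation of SWAPV(PRECISION_CHAR, void) expands to a signature along these lines (a sketch of the plain macro expansion):

// Expansion of SWAPV(PRECISION_CHAR, void) for the single-precision instance
void bli_sswapv_sifive_x280_intr(
    dim_t n,
    void* restrict x_, inc_t incx,
    void* restrict y_, inc_t incy,
    const cntx_t* cntx
);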
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef SWAPV + +SWAPV(PRECISION_CHAR, void) +{ + (void)cntx; + DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, vl); + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, vl); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2 * FLT_SIZE * incx, yvec, vl); + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, xvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // SWAPV diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..efa7222abf --- /dev/null +++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_intr/bli_swapv_sifive_x280_intr_real.c @@ -0,0 +1,76 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef SWAPV + +SWAPV(PRECISION_CHAR, void) +{ + (void)cntx; + DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL)(y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + if (incx == 1) + VSE_V_F(PREC, LMUL)(x, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, yvec, vl); + if (incy == 1) + VSE_V_F(PREC, LMUL)(y, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // SWAPV diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c index dce4085bff..da688851d0 100644 --- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c +++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c @@ -51,7 +51,7 @@ #define XPBYV(...) XPBYV_(__VA_ARGS__) -#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_intr #define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) // Single precision real diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c deleted file mode 100644 index 43c2ba44e2..0000000000 --- a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
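As a rough illustration of what the overloaded macros in the real swapv body above resolve to, here is a minimal standalone sketch of the same vector-length-agnostic loop for single precision, under the assumption that VSETVL / VLE_V_F / VLSE_V_F / VSE_V_F / VSSE_V_F map to the standard __riscv_-prefixed RVV C intrinsics; the function name is illustrative only.

// Minimal sketch of the real swapv loop for single precision, assuming the
// overloaded macros resolve to the standard RVV C intrinsics
// (VSETVL(32, m8) -> __riscv_vsetvl_e32m8, VLE_V_F -> __riscv_vle32_v_f32m8, etc.).
#include <riscv_vector.h>
#include <stddef.h>

static void swapv_f32_sketch(size_t n, float* x, ptrdiff_t incx,
                                       float* y, ptrdiff_t incy)
{
    size_t avl = n;
    while (avl) {
        size_t vl = __riscv_vsetvl_e32m8(avl);
        vfloat32m8_t xvec, yvec;
        // Unit stride uses vle32/vse32; otherwise the strided forms take a
        // byte stride of sizeof(float) * inc (FLT_SIZE * incx in the kernel).
        if (incx == 1) xvec = __riscv_vle32_v_f32m8(x, vl);
        else           xvec = __riscv_vlse32_v_f32m8(x, (ptrdiff_t)sizeof(float) * incx, vl);
        if (incy == 1) yvec = __riscv_vle32_v_f32m8(y, vl);
        else           yvec = __riscv_vlse32_v_f32m8(y, (ptrdiff_t)sizeof(float) * incy, vl);
        if (incx == 1) __riscv_vse32_v_f32m8(x, yvec, vl);
        else           __riscv_vsse32_v_f32m8(x, (ptrdiff_t)sizeof(float) * incx, yvec, vl);
        if (incy == 1) __riscv_vse32_v_f32m8(y, xvec, vl);
        else           __riscv_vsse32_v_f32m8(y, (ptrdiff_t)sizeof(float) * incy, xvec, vl);
        x += (ptrdiff_t)vl * incx;
        y += (ptrdiff_t)vl * incy;
        avl -= vl;
    }
}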
- -*/ - -#include "blis.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_saxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, - const void *restrict alpha_, const void *restrict a_, inc_t inca, - inc_t lda, const void *restrict x_, inc_t incx, - void *restrict y_, inc_t incy, const cntx_t *restrict cntx) { - (void)conja; - (void)conjx; - (void)cntx; - const float *restrict alpha = alpha_; - const float *restrict a = a_; - const float *restrict x = x_; - float *restrict y = y_; - - if (m == 0 || b == 0) - return; - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = m; - while (avl) { - // process vl elements of y at a time - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - // x_tmp traverses x - // a points to the vl x b block of a needed this iteration - // a_tmp traverses the columns of this block - const float* restrict x_tmp = x; - const float* restrict a_tmp = a; - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - if (inca == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - - for (dim_t i = 1; i < b; ++i) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - if (inca == FLT_SIZE) - __asm__(VLE "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmacc.vf v0, ft0, v24"); - } - - if (incy == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(y)); - __asm__("vfmacc.vf v24, ft11, v0"); - __asm__(VSE "v24, (%0)" : : "r"(y)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy)); - __asm__("vfmacc.vf v24, ft11, v0"); - __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy)); - } - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_daxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, - const void *restrict alpha_, const void *restrict a_, inc_t inca, - inc_t lda, const void *restrict x_, inc_t incx, - void *restrict y_, inc_t incy, const cntx_t *restrict cntx) { - (void)conja; - (void)conjx; - (void)cntx; - const double *restrict alpha = alpha_; - const double *restrict a = a_; - const double *restrict x = x_; - double *restrict y = y_; - - if (m == 0 || b == 0) - return; - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - size_t avl = m; - while (avl) { - // process vl elements of y at a time - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - // x_tmp traverses x - // a points to the vl x b block of a needed this iteration - // a_tmp traverses the columns of this block - const double* restrict x_tmp = x; - const double* 
restrict a_tmp = a; - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - if (inca == FLT_SIZE) - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - - for (dim_t i = 1; i < b; ++i) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - if (inca == FLT_SIZE) - __asm__(VLE "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmacc.vf v0, ft0, v24"); - } - - if (incy == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(y)); - __asm__("vfmacc.vf v24, ft11, v0"); - __asm__(VSE "v24, (%0)" : : "r"(y)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy)); - __asm__("vfmacc.vf v24, ft11, v0"); - __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy)); - } - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLSEG "vlseg2e32.v " -#define VLSSEG "vlsseg2e32.v " -#define VSSEG "vsseg2e32.v " -#define VSSSEG "vssseg2e32.v " - -void bli_caxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, - const void *restrict alpha_, const void *restrict a_, - inc_t inca, inc_t lda, const void *restrict x_, - inc_t incx, void *restrict y_, inc_t incy, - const cntx_t *restrict cntx) { - (void)cntx; - const scomplex *restrict alpha = alpha_; - const scomplex *restrict a = a_; - const scomplex *restrict x = x_; - scomplex *restrict y = y_; - - if (m == 0 || b == 0) - return; - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - size_t avl = m; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - const scomplex* restrict x_tmp = x; - const scomplex* restrict a_tmp = a; - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); - if (inca == 2 * FLT_SIZE) - __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmul.vf v0, v24, ft0"); - __asm__("vfmul.vf v4, v24, ft1"); - if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfmsac.vf v4, ft0, v28"); - } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } else { - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfnmacc.vf v4, ft0, v28"); - } - - for (dim_t i = 1; i < b; ++i) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); - if (inca == 2 * FLT_SIZE) - __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); 
- __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmacc.vf v0, ft0, v24"); - if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v4, ft1, v24"); - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { - __asm__("vfnmsac.vf v4, ft1, v24"); - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v4, ft1, v24"); - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE - __asm__("vfnmsac.vf v4, ft1, v24"); - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } - } - - if (incy == 2 * FLT_SIZE) { - __asm__(VLSEG "v24, (%0)" : : "r"(y)); - __asm__("vfmacc.vf v24, ft10, v0"); - __asm__("vfmacc.vf v28, ft10, v4"); - __asm__("vfnmsac.vf v24, ft11, v4"); - __asm__("vfmacc.vf v28, ft11, v0"); - __asm__(VSSEG "v24, (%0)" : : "r"(y)); - } else { - __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); - __asm__("vfmacc.vf v24, ft10, v0"); - __asm__("vfmacc.vf v28, ft10, v4"); - __asm__("vfnmsac.vf v24, ft11, v4"); - __asm__("vfmacc.vf v28, ft11, v0"); - __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); - } - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLSEG -#undef VLSSEG -#undef VSSEG -#undef VSSSEG - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLSEG "vlseg2e64.v " -#define VLSSEG "vlsseg2e64.v " -#define VSSEG "vsseg2e64.v " -#define VSSSEG "vssseg2e64.v " - -void bli_zaxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, - const void *restrict alpha_, const void *restrict a_, - inc_t inca, inc_t lda, const void *restrict x_, - inc_t incx, void *restrict y_, inc_t incy, - const cntx_t *restrict cntx) { - (void)cntx; - const dcomplex *restrict alpha = alpha_; - const dcomplex *restrict a = a_; - const dcomplex *restrict x = x_; - dcomplex *restrict y = y_; - - if (m == 0 || b == 0) - return; - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - size_t avl = m; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" - : "=r"(vl) - : "r"(avl), "i"(8 * FLT_SIZE)); - const dcomplex* restrict x_tmp = x; - const dcomplex* restrict a_tmp = a; - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); - if (inca == 2 * FLT_SIZE) - __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmul.vf v0, v24, ft0"); - __asm__("vfmul.vf v4, v24, ft1"); - if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfmsac.vf v4, ft0, v28"); - } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v0, ft1, v28"); - 
__asm__("vfnmsac.vf v4, ft0, v28"); - } else { - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfnmacc.vf v4, ft0, v28"); - } - - for (dim_t i = 1; i < b; ++i) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); - if (inca == 2 * FLT_SIZE) - __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); - else - __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); - __asm__("vfmacc.vf v0, ft0, v24"); - if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v4, ft1, v24"); - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { - __asm__("vfnmsac.vf v4, ft1, v24"); - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfmacc.vf v4, ft0, v28"); - } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { - __asm__("vfmacc.vf v4, ft1, v24"); - __asm__("vfmacc.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE - __asm__("vfnmsac.vf v4, ft1, v24"); - __asm__("vfnmsac.vf v0, ft1, v28"); - __asm__("vfnmsac.vf v4, ft0, v28"); - } - } - - if (incy == 2 * FLT_SIZE) { - __asm__(VLSEG "v24, (%0)" : : "r"(y)); - __asm__("vfmacc.vf v24, ft10, v0"); - __asm__("vfmacc.vf v28, ft10, v4"); - __asm__("vfnmsac.vf v24, ft11, v4"); - __asm__("vfmacc.vf v28, ft11, v0"); - __asm__(VSSEG "v24, (%0)" : : "r"(y)); - } else { - __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); - __asm__("vfmacc.vf v24, ft10, v0"); - __asm__("vfmacc.vf v28, ft10, v4"); - __asm__("vfnmsac.vf v24, ft11, v4"); - __asm__("vfmacc.vf v28, ft11, v0"); - __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); - } - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); - avl -= vl; - } - return; -} diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c new file mode 100644 index 0000000000..a5e0268467 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr.c @@ -0,0 +1,121 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define AXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyf_sifive_x280_intr(\ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b, \ + const T* restrict alpha_, \ + const T* restrict a_, inc_t inca, inc_t lda, \ + const T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* restrict cntx \ +) + +#define AXPYF(...) AXPYF_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_axpyf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_axpyf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_axpyf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_axpyf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef AXPYF +#undef AXPYF_ diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..0ab5509fab --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_complex.c @@ -0,0 +1,149 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AXPYF + +AXPYF(PRECISION_CHAR, void) +{ + // Computes y := y + alpha * conja(A) * conjx(x) + + (void) cntx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (m <= 0 || b <= 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) + return; + + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) ax_vec_real, ax_vec_imag; + + for (size_t i = 0; i < b; ++i) { + DATATYPE x_tmp_conj; + PASTEMAC(PRECISION_CHAR, copycjs)(conjx, *x_tmp, x_tmp_conj); + + RVV_TYPE_FX(PREC, LMUL, 2) acol_vec; + if (inca == 1) + acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) a_tmp, vl); + else + acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) a_tmp, 2 * FLT_SIZE * inca, vl); + + RVV_TYPE_F(PREC, LMUL) acol_vec_real = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); + RVV_TYPE_F(PREC, LMUL) acol_vec_imag = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); + + if (bli_is_conj(conja)) { + if (i == 0) + VCMUL_VF_CONJ + ( + PREC, LMUL, + ax_vec_real, ax_vec_imag, + acol_vec_real, acol_vec_imag, + x_tmp_conj.real, x_tmp_conj.imag, + vl + ); + else + VCMACC_VF_CONJ + ( + PREC, LMUL, + ax_vec_real, ax_vec_imag, + x_tmp_conj.real, x_tmp_conj.imag, + acol_vec_real, acol_vec_imag, + vl + ); + } + else { + if (i == 0) + VCMUL_VF + ( + PREC, LMUL, + ax_vec_real, ax_vec_imag, + acol_vec_real, acol_vec_imag, + x_tmp_conj.real, x_tmp_conj.imag, + vl + ); + else + VCMACC_VF + ( + PREC, LMUL, + ax_vec_real, ax_vec_imag, + x_tmp_conj.real, x_tmp_conj.imag, + acol_vec_real, acol_vec_imag, + vl + ); + } + + a_tmp += lda; + x_tmp += incx; + } + + RVV_TYPE_FX(PREC, LMUL, 2) yvec; + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, vl); + + RVV_TYPE_F(PREC, LMUL) yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + RVV_TYPE_F(PREC, LMUL) yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + VCMACC_VF + ( + PREC, LMUL, + yvec_real, yvec_imag, + alpha->real, alpha->imag, + ax_vec_real, ax_vec_imag, + vl + ); + + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2 * FLT_SIZE * incy, yvec, vl); + + a += vl * inca; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // AXPYF diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c new 
file mode 100644 index 0000000000..ae7dcb21d5 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_intr/bli_axpyf_sifive_x280_intr_real.c @@ -0,0 +1,96 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AXPYF + +AXPYF(PRECISION_CHAR, void) +{ + // Computes y := y + alpha * conja(A) * conjx(x) + + (void) conja; // Suppress unused parameter warnings + (void) conjx; + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (m <= 0 || b <= 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) + return; + + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) ax_vec; + + for (size_t i = 0; i < b; ++i) { + RVV_TYPE_F(PREC, LMUL) acol_vec; + if (inca == 1) + acol_vec = VLE_V_F(PREC, LMUL)(a_tmp, vl); + else + acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp, FLT_SIZE * inca, vl); + + if (i == 0) + ax_vec = VFMUL_VF(PREC, LMUL)(acol_vec, *x_tmp, vl); + else + ax_vec = VFMACC_VF(PREC, LMUL)(ax_vec, *x_tmp, acol_vec, vl); + + a_tmp += lda; + x_tmp += incx; + } + + RVV_TYPE_F(PREC, LMUL) yvec; + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL)(y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, ax_vec, vl); + + if (incy == 1) + VSE_V_F(PREC, LMUL)(y, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl); + + a += vl * inca; + y += vl * incy; + avl -= vl; + } + return; +} + +#endif // AXPYF diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c deleted file mode 100644 index ecb340707b..0000000000 --- a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c +++ /dev/null @@ -1,3120 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - 
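The axpyf kernels above all compute y := y + alpha * conja(A) * conjx(x) for an m x b block of A. For the real double-precision case, where conjugation is a no-op, this reduces to the following scalar reference sketch (BLIS types from blis.h; the helper name is illustrative only):

// Scalar reference for y := y + alpha * A * x in the real double-precision
// case; A is m x b with row stride inca and column stride lda.
#include "blis.h"

static void daxpyf_ref_sketch(dim_t m, dim_t b, double alpha,
                              const double* a, inc_t inca, inc_t lda,
                              const double* x, inc_t incx,
                              double* y, inc_t incy)
{
    for (dim_t j = 0; j < b; ++j)
        for (dim_t i = 0; i < m; ++i)
            y[i * incy] += alpha * a[i * inca + j * lda] * x[j * incx];
}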
libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "../riscv_cmul_macros_asm.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sdotxaxpyf_sifive_x280_asm( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict w_, inc_t incw, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - void* restrict z_, inc_t incz, - const cntx_t* restrict cntx - ) { - (void)conjat; - (void)conja; - (void)conjw; - (void)conjx; - (void)cntx; - const float *restrict alpha = alpha_; - const float *restrict beta = beta_; - const float *restrict a = a_; - const float *restrict w = w_; - const float *restrict x = x_; - float *restrict y = y_; - float *restrict z = z_; - - if (b == 0) - return; - else if (m == 0 || *alpha == 0.f) { - // scale y by beta - if (*beta == 0.f) - bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incw *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - incz *= FLT_SIZE; - inc_t a_bump = 5 * lda; - while (b >= 5) { - // compute dot product of w with 5 rows of a - const float* restrict w_tmp = w; - const float* restrict z_tmp = z; - const float* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const float* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - if (inca 
== FLT_SIZE) { - // a unit stride - if (first) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmul.vv v0, v24, v28"); - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmul.vv v16, v24, v28"); - first = false; - } - else { - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmacc.vv v16, v24, v28"); - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmul.vv v0, v24, v28"); - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("vfmacc.vf v20, ft4, 
v24"); - __asm__("vfmul.vv v16, v24, v28"); - first = false; - } - else { - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmacc.vv v16, v24, v28"); - } - } // end a non-unit stride - - if (incz == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - 
__asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 5; - } - - if (b > 0) { - const float* restrict w_tmp = w; - const float* restrict z_tmp = z; - const float* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); - size_t avl = m; - bool first = true; - while (avl) { - const float* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - __asm__("vmv.v.i v20, 0"); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - case 3: - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - case 2: - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - case 1: - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmul.vv v0, v24, v28"); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - 
__asm__("vfmul.vv v12, v24, v28"); - case 3: - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - case 2: - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - case 1: - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmul.vv v0, v24, v28"); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a non-unit stride - - if (incz == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 4: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 3: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 2: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * 
FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 1: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_ddotxaxpyf_sifive_x280_asm( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict w_, inc_t incw, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - void* restrict z_, inc_t incz, - const cntx_t* restrict cntx - ) { - (void)conjat; - (void)conja; - (void)conjw; - (void)conjx; - (void)cntx; - const double *restrict alpha = alpha_; - const double *restrict beta = beta_; - const double *restrict a = a_; - const double *restrict w = w_; - const double *restrict x = x_; - double *restrict y = y_; - double *restrict z = z_; - - if (b == 0) - return; - else if (m == 0 || *alpha == 0.) { - // scale y by beta - if (*beta == 0.) 
- bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incw *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - incz *= FLT_SIZE; - inc_t a_bump = 5 * lda; - while (b >= 5) { - // compute dot product of w with 5 rows of a - const double* restrict w_tmp = w; - const double* restrict z_tmp = z; - const double* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const double* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmul.vv v0, v24, v28"); - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmul.vv v16, v24, v28"); - first = false; - } - else { - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmacc.vv v16, v24, v28"); - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmul.vv v0, v24, v28"); - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), 
"r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmul.vv v16, v24, v28"); - first = false; - } - else { - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vf v20, v24, ft0"); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft4, v24"); - __asm__("vfmacc.vv v16, v24, v28"); - } - } // end a non-unit stride - - if (incz == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) 
{ - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 5; - } - - if (b > 0) { - const double* restrict w_tmp = w; - const double* restrict z_tmp = z; - const double* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); - size_t avl = m; - bool first = true; - while (avl) { - const double* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - __asm__("vmv.v.i v20, 0"); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - case 3: - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - case 2: - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - 
case 1: - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmul.vv v0, v24, v28"); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmul.vv v12, v24, v28"); - case 3: - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmul.vv v8, v24, v28"); - case 2: - __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmul.vv v4, v24, v28"); - case 1: - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmul.vv v0, v24, v28"); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft3, v24"); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft2, v24"); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vf v20, ft1, v24"); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vf v20, ft0, v24"); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a non-unit stride - - if (incz == FLT_SIZE) { - __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); - } else { - __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - __asm__("vfmacc.vf v24, ft10, v20"); - __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 4: - __asm__("vsetvli zero, %0, e%1, 
m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 3: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 2: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 1: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) 
{ - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define FMADD "fmadd.s " -#define FNMSUB "fnmsub.s " -#define FNEG "fneg.s " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define VSE "vse32.v " - -void bli_cdotxaxpyf_sifive_x280_asm - ( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict w_, inc_t incw, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - void* restrict z_, inc_t incz, - const cntx_t* restrict cntx - ) -{ - (void)cntx; - const scomplex *restrict alpha = alpha_; - const scomplex *restrict beta = beta_; - const scomplex *restrict a = a_; - const scomplex *restrict w = w_; - const scomplex *restrict x = x_; - scomplex *restrict y = y_; - scomplex *restrict z = z_; - - if (b == 0) - return; - else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) { - // scale y by beta - if (beta->real == 0.f && beta->imag == 0.f) - bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha, - // and fa6-fa7 to store beta - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta)); - __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); - // Reduce to case when A^T is not conjugated, then conjugate - // computed product A^T * w if needed. - conj_t conjatw = BLIS_NO_CONJUGATE; - if (conjat == BLIS_CONJUGATE) { - bli_toggle_conj(&conjat); - bli_toggle_conj(&conjw); - bli_toggle_conj(&conjatw); - } - conj_t conjax = BLIS_NO_CONJUGATE; - if (conja == BLIS_CONJUGATE) { - bli_toggle_conj(&conja); - bli_toggle_conj(&conjx); - bli_toggle_conj(&conjax); - } - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incw *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - incz *= 2 * FLT_SIZE; - // these are used to bump a and y, resp. 
- inc_t a_bump = 5 * lda; - inc_t y_bump = incy - FLT_SIZE; - while (b >= 5) { - // compute dot product of w with 6 rows of a - const scomplex* restrict w_tmp = w; - const scomplex* restrict z_tmp = z; - const scomplex* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const scomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - if (inca == 2 * FLT_SIZE) { - if (conjw == BLIS_NO_CONJUGATE) { - // a unit stride, conjw = no conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv(v16, v18, 
v24, v26, v28, v30); - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a unit stride, conjw = conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjw == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjw = no conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" 
: "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a non-unit stride, conjw = conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - 
__asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_CONJUGATE - } // end a non-unit stride - - if (incz == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - 
vcmul_vf(v28, v29, v0, v2, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : 
"r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - // a += 5 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 5; - } - - if (b > 0) { - // cleanup loop, 0 < b < 5 - const scomplex* restrict w_tmp = w; - const scomplex* restrict z_tmp = z; - const scomplex* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); - size_t avl = m; - bool first = true; - while (avl) { - const scomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v22, 0"); - if (inca == 2 * FLT_SIZE) { - if (conjw == BLIS_NO_CONJUGATE) { - // a unit stride, conjw = no conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { 
- case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a unit stride, conjw = conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjw == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjw = no conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - 
__asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a non-unit stride, conjw = conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { 
__asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_CONJUGATE - } // end a non-unit stride - - if (incz == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - y_bump = incy + FLT_SIZE; - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 4: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 3: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - 
__asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 2: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 1: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - } - } - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef FMADD -#undef FNMSUB -#undef FNEG -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef VSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define FMADD "fmadd.d " -#define FNMSUB "fnmsub.d " -#define FNEG "fneg.d " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define VSE "vse64.v " - -void bli_zdotxaxpyf_sifive_x280_asm - ( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict w_, inc_t incw, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* 
restrict y_, inc_t incy, - void* restrict z_, inc_t incz, - const cntx_t* restrict cntx - ) -{ - (void)cntx; - const dcomplex *restrict alpha = alpha_; - const dcomplex *restrict beta = beta_; - const dcomplex *restrict a = a_; - const dcomplex *restrict w = w_; - const dcomplex *restrict x = x_; - dcomplex *restrict y = y_; - dcomplex *restrict z = z_; - - if (b == 0) - return; - else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) { - // scale y by beta - if (beta->real == 0. && beta->imag == 0.) - bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha, - // and fa6-fa7 to store beta - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta)); - __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); - // Reduce to case when A^T is not conjugated, then conjugate - // computed product A^T * w if needed. - conj_t conjatw = BLIS_NO_CONJUGATE; - if (conjat == BLIS_CONJUGATE) { - bli_toggle_conj(&conjat); - bli_toggle_conj(&conjw); - bli_toggle_conj(&conjatw); - } - conj_t conjax = BLIS_NO_CONJUGATE; - if (conja == BLIS_CONJUGATE) { - bli_toggle_conj(&conja); - bli_toggle_conj(&conjx); - bli_toggle_conj(&conjax); - } - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incw *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - incz *= 2 * FLT_SIZE; - // these are used to bump a and y, resp. - inc_t a_bump = 5 * lda; - inc_t y_bump = incy - FLT_SIZE; - while (b >= 5) { - // compute dot product of w with 6 rows of a - const dcomplex* restrict w_tmp = w; - const dcomplex* restrict z_tmp = z; - const dcomplex* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const dcomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - if (inca == 2 * FLT_SIZE) { - if (conjw == BLIS_NO_CONJUGATE) { - // a unit stride, conjw = no conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, 
%1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a unit stride, conjw = conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx 
== BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjw == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjw = no conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv(v0, v2, 
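Each complex multiply-accumulate in this removed kernel goes through the vcmul_*/vcmacc_* macros, and conjx is applied by negating the imaginary part of the loaded x element (the FNEG above) before it enters the product. Below is a minimal scalar model of that update; cplx and cmacc are illustrative stand-ins (BLIS's scomplex/dcomplex expose the same .real/.imag fields), and the vector macros apply the same recurrence lane-wise on the real and imaginary register groups produced by the VLSEG2 segment loads.

    #include <stdbool.h>

    typedef struct { double real, imag; } cplx;   /* stand-in for BLIS dcomplex */

    /* acc += s * v, optionally conjugating the scalar s first.
       Negating s.imag is exactly what the FNEG instructions above do
       when conjx == BLIS_CONJUGATE. */
    static void cmacc(cplx* acc, cplx s, cplx v, bool conj_s)
    {
        if (conj_s) s.imag = -s.imag;
        acc->real += s.real * v.real - s.imag * v.imag;
        acc->imag += s.real * v.imag + s.imag * v.real;
    }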
v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a non-unit stride, conjw = conj - if (first) { - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } - __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vf(v20, v22, v24, v26, ft0, ft1); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, 
ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft8, ft9, v24, v26); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - } - } // end conjw == BLIS_CONJUGATE - } // end a non-unit stride - - if (incz == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
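Once the stripmined loop finishes, each column's accumulator is reduced to a single complex value with vfredusum and then combined with alpha and beta: the beta == 0 branch uses vcmul_vf so y is never read in that case, while the general branch scales the loaded y element by beta with the scalar cmul macro and then accumulates alpha * dot on top. A scalar sketch of that finalization for one output element follows (cplx is an illustrative stand-in for the BLIS complex types; the optional conjugation of dot selected by conjatw through the _conj macro variants is omitted for brevity):

    typedef struct { double real, imag; } cplx;   /* stand-in for BLIS dcomplex */

    /* y_j := beta * y_j + alpha * dot_j, never reading y_j when beta == 0
       (mirrors the beta->real == 0. && beta->imag == 0. test above). */
    static cplx finalize(cplx dot, cplx alpha, cplx beta, const cplx* y_j)
    {
        cplx out;
        if (beta.real == 0.0 && beta.imag == 0.0) {
            out.real = alpha.real * dot.real - alpha.imag * dot.imag;
            out.imag = alpha.real * dot.imag + alpha.imag * dot.real;
        }
        else {
            out.real = beta.real * y_j->real - beta.imag * y_j->imag
                     + alpha.real * dot.real - alpha.imag * dot.imag;
            out.imag = beta.real * y_j->imag + beta.imag * y_j->real
                     + alpha.real * dot.imag + alpha.imag * dot.real;
        }
        return out;
    }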
{ - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
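The cleanup path that follows (0 < b < 5) handles the leftover columns with a switch whose cases deliberately fall through, so entering at case b also executes the bodies for b-1, ..., 1, visiting the remaining columns from last to first; the intrinsic kernels added later in this patch use the same idiom in their DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY macros (there with at most three leftovers). A compact illustration of the pattern, with placeholder names:

    /* Entering at `remaining` runs every lower-numbered case as well,
       processing the leftover columns from last to first. */
    static void cleanup(int remaining, void (*process_column)(int))
    {
        switch (remaining) {
            case 3: process_column(2); /* fall through */
            case 2: process_column(1); /* fall through */
            case 1: process_column(0); break;
            default: break;
        }
    }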
{ - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - // a += 5 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 5; - } - - if (b > 0) { - // cleanup loop, 0 < b < 5 - const dcomplex* restrict w_tmp = w; - const dcomplex* restrict z_tmp = z; - const dcomplex* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); - size_t avl = m; - bool first = true; - while (avl) { - const dcomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incw == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v22, 0"); - if (inca == 2 * FLT_SIZE) { - if (conjw == BLIS_NO_CONJUGATE) { - // a unit stride, conjw = no conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, 
v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a unit stride, conjw = conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjw == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjw = no conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, 
%1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_NO_CONJUGATE - else { // conjw == BLIS_CONJUGATE - // a non-unit stride, conjw = conj - if (first) { - switch (b) { - case 4: - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } - __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); - __asm__(VLSSEG2 
"v24, (%0), %1" : : "r"(a_row), "r"(inca)); - if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft6, ft7, v24, v26); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft4, ft5, v24, v26); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vf(v20, v22, ft2, ft3, v24, v26); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vf(v20, v22, ft0, ft1, v24, v26); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjw == BLIS_CONJUGATE - } // end a non-unit stride - - if (incz == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - if (conjax == BLIS_NO_CONJUGATE) { - vcmacc_vf(v24, v26, ft10, ft11, v20, v22); - } - else { - vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); - } - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); - } - - __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); - __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - y_bump = incy + FLT_SIZE; - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 4: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 3: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 2: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 1: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatw == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft10, ft11); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, fa6, fa7, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatw == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft10, ft11, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - } - } - return; -} diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c new file mode 100644 index 0000000000..dc1bca9f6a --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr.c @@ -0,0 +1,137 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
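One structural feature carried over unchanged from the assembly above into the intrinsic kernels introduced below is the stripmined traversal of each length-m column: vsetvli (VSETVL in the intrinsic wrappers) reports how many elements the hardware will handle this pass, pointers advance by vl times their stride, and avl counts down to zero. A scalar model of that loop shape, with VLMAX standing in for the value vsetvl would return:

    #include <stddef.h>

    enum { VLMAX = 16 };   /* placeholder for the hardware vector length */

    /* Sum m strided elements in chunks of at most VLMAX, mirroring
       `while (avl) { vl = vsetvl(avl); ... ; avl -= vl; }`. */
    static double stripmined_sum(size_t m, const double* w, size_t incw)
    {
        double sum = 0.0;
        size_t avl = m;
        while (avl) {
            size_t vl = avl < VLMAX ? avl : VLMAX;   /* plays the role of vsetvl(avl) */
            for (size_t i = 0; i < vl; ++i)          /* one "vector" pass */
                sum += w[i * incw];
            w   += vl * incw;
            avl -= vl;
        }
        return sum;
    }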
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define DOTXAXPYF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxaxpyf_sifive_x280_intr(\ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b, \ + const T* restrict alpha_, \ + const T* restrict a_, inc_t inca, inc_t lda, \ + const T* restrict w_, inc_t incw, \ + const T* restrict x_, inc_t incx, \ + const T* restrict beta_, \ + T* restrict y_, inc_t incy, \ + T* restrict z_, inc_t incz, \ + const cntx_t* restrict cntx \ +) + +#define DOTXAXPYF(...) 
DOTXAXPYF_(__VA_ARGS__) + +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) +#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr +#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotxaxpyf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotxaxpyf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m2 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m2 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotxaxpyf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SETV_ +#undef SETV +#undef SCALV_ +#undef SCALV + +#undef DOTXAXPYF +#undef DOTXAXPYF_ diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..d8a984064d --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_complex.c @@ -0,0 +1,427 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
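The dispatcher above instantiates one kernel per datatype by defining DATATYPE, PRECISION_CHAR, PREC, LMUL, and FLT_SIZE, re-including the shared _real.c or _complex.c body, and then undefining everything before the next instantiation; the two-level DOTXAXPYF/DOTXAXPYF_ pair exists so that PRECISION_CHAR is expanded before it is pasted into the function name. A compressed, self-contained sketch of the same idiom follows; demo_saxpy/demo_daxpy and the axpy body are placeholders rather than names from the patch, and the shared body is written inline instead of in a separate re-included file.

    #include <stdio.h>

    /* Two-level macro so PRECISION_CHAR expands before token pasting,
       the same trick as DOTXAXPYF/DOTXAXPYF_ above. */
    #define KERNEL_(CH, T) void demo_##CH##axpy(int n, T alpha, const T* x, T* y)
    #define KERNEL(...)    KERNEL_(__VA_ARGS__)

    #define DATATYPE       float
    #define PRECISION_CHAR s
    KERNEL(PRECISION_CHAR, DATATYPE) { for (int i = 0; i < n; ++i) y[i] += alpha * x[i]; }
    #undef DATATYPE
    #undef PRECISION_CHAR

    #define DATATYPE       double
    #define PRECISION_CHAR d
    KERNEL(PRECISION_CHAR, DATATYPE) { for (int i = 0; i < n; ++i) y[i] += alpha * x[i]; }
    #undef DATATYPE
    #undef PRECISION_CHAR

    int main(void)
    {
        float  xs[2] = {1, 2}, ys[2] = {0, 0};
        double xd[2] = {1, 2}, yd[2] = {0, 0};
        demo_saxpy(2, 2.0f, xs, ys);   /* one generated function per type, */
        demo_daxpy(2, 2.0,  xd, yd);   /* as with bli_?dotxaxpyf_... above */
        printf("%f %f\n", ys[1], yd[1]);
        return 0;
    }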
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef DOTXAXPYF + +#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i) \ + do { \ + acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \ + acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ + acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ + do { \ + acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \ + acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ + acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ + do { \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc3_r, yacc3_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ + do { \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMUL_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, acol_vec_r, acol_vec_i, x[0 * incx].real, x[0 * incx].imag, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc3_r, yacc3_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[3 * incx].real, x[3 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + } while (0) + +#define 
DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ + do { \ + switch (b) { \ + case 3: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc2_r, yacc2_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + case 2: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc1_r, yacc1_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + case 1: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMUL_VV##DF_CONJ_SUF(PREC, LMUL, yacc0_r, yacc0_i, acol_vec_r, acol_vec_i, wvec_r, wvec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + } \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, DF_CONJ_SUF, AF_CONJ_SUF) \ + do { \ + switch (b) { \ + case 3: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc2_r, yacc2_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[2 * incx].real, x[2 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + case 2: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc1_r, yacc1_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[1 * incx].real, x[1 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + case 1: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMACC_VV##DF_CONJ_SUF##_TU(PREC, LMUL, yacc0_r, yacc0_i, wvec_r, wvec_i, acol_vec_r, acol_vec_i, vl); \ + VCMACC_VF##AF_CONJ_SUF(PREC, LMUL, zacc_r, zacc_i, x[0 * incx].real, x[0 * incx].imag, acol_vec_r, acol_vec_i, vl); \ + } \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_REDUCE(i) \ + do { \ + RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1); \ + RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1); \ + dot##i##_r = VF_REDUSUM_VS(PREC, LMUL)(yacc##i##_r, dot##i##_r, m); \ + dot##i##_i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i##_i, dot##i##_i, m); \ + RVV_TYPE_F(PREC, m1) y##i##_r, y##i##_i; \ + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ + if (bli_is_conj(conjatw)) \ + VCMUL_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + else \ + VCMUL_VF(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ + y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ + } \ + else { \ + PASTEMAC(PRECISION_CHAR, scals)(*beta, y[i * incy]) \ + y##i##_r = VFMV_S_F(PREC, m1)(y[i * incy].real, 1); \ + y##i##_i = VFMV_S_F(PREC, m1)(y[i * incy].imag, 1); \ + if (bli_is_conj(conjatw)) \ + VCMACC_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + else \ + VCMACC_VF(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ + y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ + } \ + } while (0) + +DOTXAXPYF(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjat(A^T) * conjx(x) + + (void) cntx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict w = w_; + const 
DATATYPE* restrict x = x_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict y = y_; + DATATYPE* restrict z = z_; + + if (b == 0) return; + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + conj_t conjatw = BLIS_NO_CONJUGATE; + conj_t conjax = BLIS_NO_CONJUGATE; + if (bli_is_conj(conjw)) { + bli_toggle_conj(&conjat); + bli_toggle_conj(&conjw); + bli_toggle_conj(&conjatw); + } + if (bli_is_conj(conjx)) { + bli_toggle_conj(&conja); + bli_toggle_conj(&conjx); + bli_toggle_conj(&conjax); + } + + while (b >= 4) { + // Compute dot product of w with 4 columns of a. + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict w_tmp = w; + DATATYPE* restrict z_tmp = z; + RVV_TYPE_F(PREC, LMUL) yacc0_r, yacc0_i, yacc1_r, yacc1_i, + yacc2_r, yacc2_i, yacc3_r, yacc3_i; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) wvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) wvec_r, wvec_i, acol_vec_r, acol_vec_i; + RVV_TYPE_F(PREC, LMUL) zacc_r, zacc_i; + if (incw == 1) + wvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, vl); + else + wvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, 2 * FLT_SIZE * incw, vl); + wvec_r = VGET_V_F(PREC, LMUL, 2)(wvec, 0); + wvec_i = VGET_V_F(PREC, LMUL, 2)(wvec, 1); + + if (first) { + if (bli_is_conj(conjat)) { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, _CONJ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ, ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ, ); + } + } + else { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , _CONJ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( , , ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, , ); + } + } + first = false; + } + else { + if (bli_is_conj(conjat)) { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, _CONJ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , _CONJ, ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ, ); + } + } + else { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , _CONJ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( , , ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED, , ); + } + } + } + + RVV_TYPE_FX(PREC, LMUL, 2) zvec; + if (incz == 1) + zvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, vl); + else + zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, vl); + RVV_TYPE_F(PREC, LMUL) zvec_r = VGET_V_F(PREC, LMUL, 2)(zvec, 0); + RVV_TYPE_F(PREC, LMUL) zvec_i = VGET_V_F(PREC, LMUL, 2)(zvec, 1); + if (bli_is_conj(conjax)) + VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + else + VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_r); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_i); + if (incz == 1) + VSSEG2_V_F(PREC, LMUL, 
2)((BASE_DT*) z_tmp, zvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, zvec, vl); + + a_tmp += vl * inca; + w_tmp += vl * incw; + z_tmp += vl * incz; + avl -= vl; + } + + DOTXAXPYF_SIFIVE_X280_REDUCE(0); + DOTXAXPYF_SIFIVE_X280_REDUCE(1); + DOTXAXPYF_SIFIVE_X280_REDUCE(2); + DOTXAXPYF_SIFIVE_X280_REDUCE(3); + + a += 4 * lda; + x += 4 * incx; + y += 4 * incy; + b -= 4; + } + + if (b > 0) { + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict w_tmp = w; + DATATYPE* restrict z_tmp = z; + RVV_TYPE_F(PREC, LMUL) yacc0_r, yacc0_i, yacc1_r, yacc1_i, yacc2_r, yacc2_i; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) wvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) wvec_r, wvec_i, acol_vec_r, acol_vec_i; + RVV_TYPE_F(PREC, LMUL) zacc_r = VFMV_V_F(PREC, LMUL)(0, vl); + RVV_TYPE_F(PREC, LMUL) zacc_i = VFMV_V_F(PREC, LMUL)(0, vl); + if (incw == 1) + wvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, vl); + else + wvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) w_tmp, 2 * FLT_SIZE * incw, vl); + wvec_r = VGET_V_F(PREC, LMUL, 2)(wvec, 0); + wvec_i = VGET_V_F(PREC, LMUL, 2)(wvec, 1); + + if (first) { + if (bli_is_conj(conjat)) { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, _CONJ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ, ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ, ); + } + } + else { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , _CONJ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( , , ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, , ); + } + } + first = false; + } + else { + if (bli_is_conj(conjat)) { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, _CONJ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , _CONJ, ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ, ); + } + } + else { + if (bli_is_conj(conja)) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , _CONJ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , _CONJ); + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( , , ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, , ); + } + } + } + + RVV_TYPE_FX(PREC, LMUL, 2) zvec; + if (incz == 1) + zvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, vl); + else + zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, vl); + RVV_TYPE_F(PREC, LMUL) zvec_r = VGET_V_F(PREC, LMUL, 2)(zvec, 0); + RVV_TYPE_F(PREC, LMUL) zvec_i = VGET_V_F(PREC, LMUL, 2)(zvec, 1); + if (bli_is_conj(conjax)) + VCMACC_VF_CONJ(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + else + VCMACC_VF(PREC, LMUL, zvec_r, zvec_i, alpha->real, alpha->imag, zacc_r, zacc_i, vl); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_r); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_i); + if (incz == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, zvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z_tmp, 2 * FLT_SIZE * incz, zvec, vl); + + a_tmp += vl * inca; + w_tmp += vl * incw; + z_tmp += vl * incz; + avl -= vl; + } + + switch (b) { + case 3: + 
DOTXAXPYF_SIFIVE_X280_REDUCE(2); + case 2: + DOTXAXPYF_SIFIVE_X280_REDUCE(1); + case 1: + DOTXAXPYF_SIFIVE_X280_REDUCE(0); + } + } + return; +} + +#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL +#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED +#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY +#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY +#undef DOTXAXPYF_SIFIVE_X280_REDUCE + +#endif // DOTXAXPYF diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c new file mode 100644 index 0000000000..57ef4f7447 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_intr/bli_dotxaxpyf_sifive_x280_intr_real.c @@ -0,0 +1,283 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
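For reference, the operation this kernel family fuses is the one spelled out in the comment of the real-domain body below: y := beta * y + alpha * conjat(A^T) * conjw(w) together with z := z + alpha * conja(A) * conjx(x), where A is an m x b panel with row stride inca and column stride lda, so each element of A is loaded once and feeds both updates. In the complex variant the kernel folds conjw into conjat plus a final conjugation (conjatw), and conjx into conja plus conjax, using conj(a) * conj(w) = conj(a * w). A scalar reference for the real case, where the conjugations are no-ops, is sketched here (ref_ddotxaxpyf is an illustrative name, not part of the patch); note that, unlike this naive version, the kernels never read y when beta is zero.

    /* Reference: y := beta*y + alpha*A^T*w  and  z := z + alpha*A*x,
       with A an m x b panel and element (i,j) at a[i*inca + j*lda]. */
    static void ref_ddotxaxpyf(int m, int b, double alpha,
                               const double* a, int inca, int lda,
                               const double* w, int incw,
                               const double* x, int incx,
                               double beta,
                               double* y, int incy,
                               double* z, int incz)
    {
        for (int j = 0; j < b; ++j) {
            double dot = 0.0;
            for (int i = 0; i < m; ++i) {
                double aij = a[i * inca + j * lda];
                dot += aij * w[i * incw];                 /* dotxf part */
                z[i * incz] += alpha * x[j * incx] * aij; /* axpyf part */
            }
            y[j * incy] = beta * y[j * incy] + alpha * dot;
        }
    }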
+ +*/ + +// clang-format off +#ifdef DOTXAXPYF + +#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL(i) \ + do { \ + acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ + do { \ + acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF) \ + do { \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + yacc3 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_LOOP_BODY(LOAD_SUF) \ + do { \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl); \ + zacc = VFMUL_VF(PREC, LMUL)(acol_vec, x[0 * incx], vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + yacc3 = VFMACC_VV_TU(PREC, LMUL)(yacc3, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[3 * incx], acol_vec, vl); \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF) \ + do { \ + switch (b) { \ + case 3: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + yacc2 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ + case 2: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + yacc1 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ + case 1: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + yacc0 = VFMUL_VV(PREC, LMUL)(acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \ + } \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF) \ + do { \ + switch (b) { \ + case 3: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + yacc2 = VFMACC_VV_TU(PREC, LMUL)(yacc2, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[2 * incx], acol_vec, vl); \ + case 2: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + yacc1 = VFMACC_VV_TU(PREC, LMUL)(yacc1, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[1 * incx], acol_vec, vl); \ + case 1: \ + DOTXAXPYF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + yacc0 = VFMACC_VV_TU(PREC, LMUL)(yacc0, acol_vec, wvec, vl); \ + zacc = VFMACC_VF(PREC, LMUL)(zacc, x[0 * incx], acol_vec, vl); \ + } \ + } while (0) + +#define DOTXAXPYF_SIFIVE_X280_REDUCE(i) \ + do { \ + RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1); \ + dot##i = VF_REDUSUM_VS(PREC, LMUL)(yacc##i, dot##i, m); \ + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ + dot##i = VFMUL_VF(PREC, m1)(dot##i, *alpha, 1); \ + y[i * incy] = VFMV_F_S(PREC)(dot##i); \ + } 
\ + else { \ + y[i * incy] *= *beta; \ + RVV_TYPE_F(PREC, m1) y##i = VFMV_S_F(PREC, m1)(y[i * incy], 1); \ + y##i = VFMACC_VF(PREC, m1)(y##i, *alpha, dot##i, 1); \ + y[i * incy] = VFMV_F_S(PREC)(y##i); \ + } \ + } while (0) + +DOTXAXPYF(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjat(A^T) * conjw(w) + // z := z + alpha * conja(A) * conjx(x) + + (void) conjat; // Suppress unused parameter warnings + (void) conja; + (void) conjw; + (void) conjx; + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict w = w_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict y = y_; + DATATYPE* restrict z = z_; + + if (b == 0) return; + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + while (b >= 4) { + // Process 4 columns of a at a time. + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict w_tmp = w; + DATATYPE* restrict z_tmp = z; + RVV_TYPE_F(PREC, LMUL) yacc0, yacc1, yacc2, yacc3; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) wvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) zacc; + if (incw == 1) + wvec = VLE_V_F(PREC, LMUL)(w_tmp, vl); + else + wvec = VLSE_V_F(PREC, LMUL)(w_tmp, FLT_SIZE * incw, vl); + if (first) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST( ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED); + first = false; + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_LOOP_BODY( ); + else + DOTXAXPYF_SIFIVE_X280_LOOP_BODY(_STRIDED); + } + + RVV_TYPE_F(PREC, LMUL) zvec; + if (incz == 1) + zvec = VLE_V_F(PREC, LMUL)(z_tmp, vl); + else + zvec = VLSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, vl); + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, zacc, vl); + if (incz == 1) + VSE_V_F(PREC, LMUL)(z_tmp, zvec, vl); + else + VSSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, zvec, vl); + + a_tmp += vl * inca; + w_tmp += vl * incw; + z_tmp += vl * incz; + avl -= vl; + } + + DOTXAXPYF_SIFIVE_X280_REDUCE(0); + DOTXAXPYF_SIFIVE_X280_REDUCE(1); + DOTXAXPYF_SIFIVE_X280_REDUCE(2); + DOTXAXPYF_SIFIVE_X280_REDUCE(3); + + a += 4 * lda; + x += 4 * incx; + y += 4 * incy; + b -= 4; + } + + if (b > 0) { + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict w_tmp = w; + DATATYPE* restrict z_tmp = z; + RVV_TYPE_F(PREC, LMUL) yacc0, yacc1, yacc2; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) wvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) zacc = VFMV_V_F(PREC, LMUL)(0, vl); + if (incw == 1) + wvec = VLE_V_F(PREC, LMUL)(w_tmp, vl); + else + wvec = VLSE_V_F(PREC, LMUL)(w_tmp, FLT_SIZE * incw, vl); + if (first) { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST( ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED); + first = false; + } + else { + if (inca == 1) + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY( ); + else + DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY(_STRIDED); + } + + RVV_TYPE_F(PREC, LMUL) zvec; + if (incz == 1) + zvec = VLE_V_F(PREC, LMUL)(z_tmp, vl); + else + zvec = VLSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, vl); + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, zacc, vl); + if (incz == 1) + VSE_V_F(PREC, LMUL)(z_tmp, zvec, vl); + else + VSSE_V_F(PREC, LMUL)(z_tmp, FLT_SIZE * incz, zvec, vl); + + a_tmp += 
vl * inca; + w_tmp += vl * incw; + z_tmp += vl * incz; + avl -= vl; + } + + switch (b) { + case 3: + DOTXAXPYF_SIFIVE_X280_REDUCE(2); + case 2: + DOTXAXPYF_SIFIVE_X280_REDUCE(1); + case 1: + DOTXAXPYF_SIFIVE_X280_REDUCE(0); + } + } + return; +} + +#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL +#undef DOTXAXPYF_SIFIVE_X280_LOAD_ACOL_STRIDED +#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_X280_LOOP_BODY +#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY_FIRST +#undef DOTXAXPYF_SIFIVE_X280_CLEANUP_BODY +#undef DOTXAXPYF_SIFIVE_X280_REDUCE + +#endif // DOTXAXPYF diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c deleted file mode 100644 index 5ac2d41667..0000000000 --- a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c +++ /dev/null @@ -1,2645 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "../riscv_cmul_macros_asm.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " - -void bli_sdotxf_sifive_x280_asm( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - const cntx_t* restrict cntx - ) { - // think of a as b x m row major matrix (i.e. 
rsa = lda, csa = inca) - // we process 6 elements of y per iteration, using y_tmp to load/store from - // y a points to the 6 x m block of a needed this iteration each 6 x m block - // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we - // use x_tmp to load from x a_row is used to load each of the 6 rows of this - // 6 x vl block - (void)conjat; - (void)conjx; - (void)cntx; - const float* restrict alpha = alpha_; - const float* restrict a = a_; - const float* restrict x = x_; - const float* restrict beta = beta_; - float* restrict y = y_; - - if (b == 0) - return; - else if (m == 0 || *alpha == 0.f) { - // scale y by beta - if (*beta == 0.f) - bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - inc_t a_bump = 6 * lda; // to bump a down 6 rows - - while (b >= 6) { - // compute dot product of x with 6 rows of a - const float* restrict x_tmp = x; - const float* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const float* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - __asm__(VLE "v0, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v0, v0, v28"); - __asm__(VLE "v4, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - __asm__(VLE "v8, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - __asm__(VLE "v12, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - __asm__(VLE "v16, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - __asm__(VLE "v20, (%0)" : : "r"(a_row)); - __asm__("vfmul.vv v20, v20, v28"); - first = false; - } - else { - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vv v20, v24, v28"); - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v0, v0, v28"); - __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), 
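/* A scalar sketch of the dotxf operation this (now deleted) kernel computes,
 * with a viewed as a b x m row-major matrix (rsa = lda, csa = inca), real
 * data, conjugation ignored; the name is illustrative. The assembly above
 * simply blocks this computation: 6 rows of a at a time, vl columns per
 * vsetvli chunk, with a final vfredusum reduction per row. */
static void dotxf_sketch(dim_t m, dim_t b, float alpha,
                         const float* a, inc_t inca, inc_t lda,
                         const float* x, inc_t incx,
                         float beta, float* y, inc_t incy)
{
    for (dim_t i = 0; i < b; ++i)
    {
        float dot = 0.0f;
        for (dim_t k = 0; k < m; ++k)
            dot += a[i * lda + k * inca] * x[k * incx];  // dot of row i of a with x
        y[i * incy] = (beta == 0.0f) ? alpha * dot
                                     : beta * y[i * incy] + alpha * dot;
    }
}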
"r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmul.vv v20, v20, v28"); - first = false; - } - else { - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vv v20, v24, v28"); - } - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - 
__asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v20, v20, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v20, v20, ft10"); - __asm__(VSE "v20, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v20"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - // a += 6 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 6; - } - - if (b > 0) { - // compute dot product of x with remaining < 6 rows of a - const float* restrict x_tmp = x; - // a_col will move along the last row of a! - const float* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - size_t avl = m; - bool first = true; - while (avl) { - const float* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - switch (b) { - case 5: - __asm__(VLE "v16, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - case 4: - __asm__(VLE "v12, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - case 3: - __asm__(VLE "v8, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - case 2: - __asm__(VLE "v4, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_row)); - __asm__("vfmul.vv v0, v0, v28"); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - case 4: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - switch (b) { - case 5: - __asm__(VLSE "v16, (%0), 
%1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - case 4: - __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - case 3: - __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - case 2: - __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - case 1: - __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmul.vv v0, v0, v28"); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - case 4: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - __asm__("vmv.s.x v31, x0"); - switch (b) { - case 5: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 4: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 3: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 2: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, 
v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 1: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " - -void bli_ddotxf_sifive_x280_asm( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - const cntx_t* restrict cntx - ) { - // think of a as b x m row major matrix (i.e. rsa = lda, csa = inca) - // we process 6 elements of y per iteration, using y_tmp to load/store from - // y a points to the 6 x m block of a needed this iteration each 6 x m block - // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we - // use x_tmp to load from x a_row is used to load each of the 6 rows of this - // 6 x vl block - (void)conjat; - (void)conjx; - (void)cntx; - const double* restrict alpha = alpha_; - const double* restrict a = a_; - const double* restrict x = x_; - const double* restrict beta = beta_; - double* restrict y = y_; - - if (b == 0) - return; - else if (m == 0 || *alpha == 0.) { - // scale y by beta - if (*beta == 0.) 
- bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - inca *= FLT_SIZE; - lda *= FLT_SIZE; - incx *= FLT_SIZE; - incy *= FLT_SIZE; - inc_t a_bump = 6 * lda; // to bump a down 6 rows - - while (b >= 6) { - // compute dot product of x with 6 rows of a - const double* restrict x_tmp = x; - const double* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const double* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - __asm__(VLE "v0, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v0, v0, v28"); - __asm__(VLE "v4, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - __asm__(VLE "v8, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - __asm__(VLE "v12, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - __asm__(VLE "v16, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - __asm__(VLE "v20, (%0)" : : "r"(a_row)); - __asm__("vfmul.vv v20, v20, v28"); - first = false; - } - else { - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v0, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vv v20, v24, v28"); - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v0, v0, v28"); - __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmul.vv v20, v20, v28"); - first = false; - } - else { - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v0, v24, v28"); - 
__asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vv v20, v24, v28"); - } - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) 
{ - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v20, v20, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v20, v20, ft10"); - __asm__(VSE "v20, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v20"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); - - // a += 6 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 6; - } - - if (b > 0) { - // compute dot product of x with remaining < 6 rows of a - const double* restrict x_tmp = x; - // a_col will move along the last row of a! - const double* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - size_t avl = m; - bool first = true; - while (avl) { - const double* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == FLT_SIZE) - __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == FLT_SIZE) { - // a unit stride - if (first) { - switch (b) { - case 5: - __asm__(VLE "v16, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - case 4: - __asm__(VLE "v12, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - case 3: - __asm__(VLE "v8, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v8, v8, v28"); - case 2: - __asm__(VLE "v4, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_row)); - __asm__("vfmul.vv v0, v0, v28"); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - case 4: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLE "v24, (%0)" : : "r"(a_row)); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a unit stride - else { - // a non-unit stride - if (first) { - switch (b) { - case 5: - __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v16, v16, v28"); - case 4: - __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v12, v12, v28"); - case 3: - __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - 
__asm__("vfmul.vv v8, v8, v28"); - case 2: - __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmul.vv v4, v4, v28"); - case 1: - __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmul.vv v0, v0, v28"); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v16, v24, v28"); - case 4: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v12, v24, v28"); - case 3: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v8, v24, v28"); - case 2: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - __asm__("vfmacc.vv v4, v24, v28"); - case 1: - __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("vfmacc.vv v0, v24, v28"); - } - } - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - __asm__("vmv.s.x v31, x0"); - switch (b) { - case 5: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v16, v16, ft10"); - __asm__(VSE "v16, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v16"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 4: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v12, v12, ft10"); - __asm__(VSE "v12, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v12"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 3: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v8, v8, ft10"); - __asm__(VSE "v8, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v8"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 2: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) 
{ - __asm__("vfmul.vf v4, v4, ft10"); - __asm__(VSE "v4, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v4"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); - case 1: - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (*beta == 0.) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__(VSE "v0, (%0)" : : "r"(y)); - } - else { - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); - __asm__(FMUL "ft0, ft11, ft0"); - __asm__("vfmv.s.f v30, ft0"); - __asm__("vfmacc.vf v30, ft10, v0"); - __asm__(VSE "v30, (%0)" : : "r"(y)); - } - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define FMUL "fmul.s " -#define FMADD "fmadd.s " -#define FNMSUB "fnmsub.s " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define VSE "vse32.v " - -void bli_cdotxf_sifive_x280_asm( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - const cntx_t* restrict cntx - ) { - (void)cntx; - const scomplex* restrict alpha = alpha_; - const scomplex* restrict a = a_; - const scomplex* restrict x = x_; - const scomplex* restrict beta = beta_; - scomplex* restrict y = y_; - - if (b == 0) - return; - else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) { - // scale y by beta - if (beta->real == 0.f && beta->imag == 0.f) - bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); - // Reduce to case when A^T is not conjugated, then conjugate - // computed product A^T * x if needed. - conj_t conjatx = BLIS_NO_CONJUGATE; - if (conjat == BLIS_CONJUGATE) { - bli_toggle_conj(&conjat); - bli_toggle_conj(&conjx); - bli_toggle_conj(&conjatx); - } - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - // these are used to bump a and y, resp. 
- inc_t a_bump = 6 * lda; - inc_t y_bump = incy - FLT_SIZE; - while (b >= 6) { - // compute dot product of x with 6 rows of a - const scomplex* restrict x_tmp = x; - const scomplex* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const scomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == 2 * FLT_SIZE) { - if (conjx == BLIS_NO_CONJUGATE) { - // a unit stride, conjx = no conj - if (first) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a unit stride, conjx = conj - if (first) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv_conj(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, 
v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjx == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjx = no conj - if (first) { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx = BLIS_CONJUGATE - // a non-unit stride, conjx = conj - if (first) { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - 
vcmul_vv_conj(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_CONJUGATE - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft8, ft9); - } - else { - 
vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v20, v20, v31"); - __asm__("vfredusum.vs v22, v22, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v20, v22, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v20, v22); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); 
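/* A scalar sketch of the writeback the vcmul_vf/vcmacc_vf + cmul sequence
 * performs for one element of y, i.e. y[i] = beta * y[i] + alpha * dot with
 * the usual complex product; dot is the vfredusum-reduced dot product
 * (conjugated beforehand when conjatx requires it). Name illustrative. */
static scomplex cdotxf_writeback_sketch(scomplex alpha, scomplex beta,
                                        scomplex y, scomplex dot)
{
    scomplex out;
    if (beta.real == 0.0f && beta.imag == 0.0f)
    {
        // vcmul_vf: out = alpha * dot
        out.real = alpha.real * dot.real - alpha.imag * dot.imag;
        out.imag = alpha.real * dot.imag + alpha.imag * dot.real;
    }
    else
    {
        // cmul: out = beta * y, then vcmacc_vf: out += alpha * dot
        out.real = beta.real * y.real - beta.imag * y.imag;
        out.imag = beta.real * y.imag + beta.imag * y.real;
        out.real += alpha.real * dot.real - alpha.imag * dot.imag;
        out.imag += alpha.real * dot.imag + alpha.imag * dot.real;
    }
    return out;
}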
- - // a += 6 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 6; - } - - if (b > 0) { - // cleanup loop, 0 < b < 6 - const scomplex* restrict x_tmp = x; - const scomplex* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - size_t avl = m; - bool first = true; - while (avl) { - const scomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == 2 * FLT_SIZE) { - if (conjx == BLIS_NO_CONJUGATE) { - // a unit stride, conjx = no conj - if (first) { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a unit stride, conjx = conj - if (first) { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : 
"r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjx == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjx = no conj - if (first) { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a non-unit stride, conjx = conj - if (first) { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), 
"r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_CONJUGATE - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - y_bump = incy + FLT_SIZE; - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 5: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 4: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 3: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); - } - } - __asm__(VSE "v28, 
(%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 2: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 1: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0.f && beta->imag == 0.f) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - } - } // end cleanup - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef FMUL -#undef FMADD -#undef FNMSUB -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef VSE - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define FMUL "fmul.d " -#define FMADD "fmadd.d " -#define FNMSUB "fnmsub.d " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define VSE "vse64.v " - -void bli_zdotxf_sifive_x280_asm( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b, - const void* restrict alpha_, - const void* restrict a_, inc_t inca, inc_t lda, - const void* restrict x_, inc_t incx, - const void* restrict beta_, - void* restrict y_, inc_t incy, - const cntx_t* restrict cntx - ) { - (void)cntx; - const dcomplex* restrict alpha = alpha_; - const dcomplex* restrict a = a_; - const dcomplex* restrict x = x_; - const dcomplex* restrict beta = beta_; - dcomplex* restrict y = y_; - - if (b == 0) - return; - else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) { - // scale y by beta - if (beta->real == 0. && beta->imag == 0.) 
- bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - else - bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); - return; - } - - __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); - // Reduce to case when A^T is not conjugated, then conjugate - // computed product A^T * x if needed. - conj_t conjatx = BLIS_NO_CONJUGATE; - if (conjat == BLIS_CONJUGATE) { - bli_toggle_conj(&conjat); - bli_toggle_conj(&conjx); - bli_toggle_conj(&conjatx); - } - inca *= 2 * FLT_SIZE; - lda *= 2 * FLT_SIZE; - incx *= 2 * FLT_SIZE; - incy *= 2 * FLT_SIZE; - // these are used to bump a and y, resp. - inc_t a_bump = 6 * lda; - inc_t y_bump = incy - FLT_SIZE; - while (b >= 6) { - // compute dot product of x with 6 rows of a - const dcomplex* restrict x_tmp = x; - const dcomplex* restrict a_col = a; - size_t avl = m; - bool first = true; - while (avl) { - const dcomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == 2 * FLT_SIZE) { - if (conjx == BLIS_NO_CONJUGATE) { - // a unit stride, conjx = no conj - if (first) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a unit stride, conjx = conj - if (first) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - 
__asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv_conj(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjx == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjx = no conj - if (first) { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx = BLIS_CONJUGATE - // a non-unit stride, conjx = conj - if (first) { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, 
%0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv_conj(v20, v22, v24, v26, v28, v30); - first = false; - } - else { - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); - } - } // end conjx == BLIS_CONJUGATE - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("vmv.s.x v31, x0"); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v20, v20, v31"); - __asm__("vfredusum.vs v22, v22, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v20, v22, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v20, v22); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); - - // a += 6 * lda; - __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); - b -= 6; - } - - if (b > 0) { - // cleanup loop, 0 < b < 6 - const dcomplex* restrict x_tmp = x; - const dcomplex* restrict a_col; - __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); - size_t avl = m; - bool first = true; - while (avl) { - const dcomplex* restrict a_row = a_col; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - if (incx == 2 * FLT_SIZE) - __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); - else - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); - if (inca == 2 * FLT_SIZE) { - if (conjx == BLIS_NO_CONJUGATE) { - // a unit stride, conjx = no conj - if (first) { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : 
"r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a unit stride, conjx = conj - if (first) { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_CONJUGATE - } // end a unit stride - else { // a non-unit stride - if (conjx == BLIS_NO_CONJUGATE) { - // a non-unit stride, conjx = no conj - if (first) { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v8, v10, v24, v26, v28, v30); - case 
2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_NO_CONJUGATE - else { // conjx == BLIS_CONJUGATE - // a non-unit stride, conjx = conj - if (first) { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmul_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmul_vv_conj(v0, v2, v24, v26, v28, v30); - } - first = false; - } - else { - switch (b) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); - case 4: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); - case 2: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); - vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); - vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); - } - } - } // end conjx == BLIS_CONJUGATE - } // end a non-unit stride - __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); - __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); - avl -= vl; - } - - __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); - y_bump = incy + FLT_SIZE; - __asm__("vmv.s.x v31, x0"); - - switch (b) { - case 5: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v16, v16, v31"); - __asm__("vfredusum.vs v18, v18, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v16, v18, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v16, v18); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 4: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v12, v12, v31"); - __asm__("vfredusum.vs v14, v14, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v12, v14, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v12, v14); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 3: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v8, v8, v31"); - __asm__("vfredusum.vs v10, v10, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v8, v10, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v8, v10); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 2: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v4, v4, v31"); - __asm__("vfredusum.vs v6, v6, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) 
{ - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v4, v6, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v4, v6); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); - case 1: - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); - __asm__("vfredusum.vs v0, v0, v31"); - __asm__("vfredusum.vs v2, v2, v31"); - __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - if (beta->real == 0. && beta->imag == 0.) { - if (conjatx == BLIS_NO_CONJUGATE) { - vcmul_vf(v28, v29, v0, v2, ft8, ft9); - } - else { - vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); - } - } - else { - __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); - cmul(ft0, ft1, ft10, ft11, ft2, ft3); - __asm__("vfmv.s.f v28, ft0"); - __asm__("vfmv.s.f v29, ft1"); - if (conjatx == BLIS_NO_CONJUGATE) { - vcmacc_vf(v28, v29, ft8, ft9, v0, v2); - } - else { - vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); - } - } - __asm__(VSE "v28, (%0)" : : "r"(y)); - __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); - __asm__(VSE "v29, (%0)" : : "r"(y)); - } - } // end cleanup - return; -} diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c new file mode 100644 index 0000000000..9396515b30 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr.c @@ -0,0 +1,132 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define DOTXF_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxf_sifive_x280_intr(\ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b, \ + const T* restrict alpha_, \ + const T* restrict a_, inc_t inca, inc_t lda, \ + const T* restrict x_, inc_t incx, \ + const T* restrict beta_, \ + T* restrict y_, inc_t incy, \ + const cntx_t* restrict cntx \ +) + +#define DOTXF(...) DOTXF_(__VA_ARGS__) + +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_intr +#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) +#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr +#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotxf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotxf_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m2 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotxf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m2 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotxf_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SETV_ +#undef SETV +#undef SCALV_ +#undef SCALV + +#undef DOTXF +#undef DOTXF_ diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..463a111f07 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_complex.c @@ -0,0 +1,324 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
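The dispatch translation unit above instantiates one dotxf kernel per precision by textual inclusion: each block defines DATATYPE, PRECISION_CHAR, PREC, LMUL, and FLT_SIZE, pulls in the shared bli_dotxf_sifive_x280_intr_real.c or _complex.c body, then undefines everything before the next precision. A minimal single-file sketch of the same idea, assuming a body-generating macro in place of the #include (DEFINE_SUM, sum_s, and sum_d are hypothetical illustration names, not part of this kernel set):

/* demo of per-precision instantiation via macros; illustrative only */
#include <stdio.h>

#define PASTE_(a, b) a##b
#define PASTE(a, b) PASTE_(a, b)

/* shared "kernel body"; in the BLIS files this lives in *_intr_real.c
   and is brought in with #include rather than a macro */
#define DEFINE_SUM(T, SUFFIX) \
    static T PASTE(sum_, SUFFIX)(const T* x, int n) \
    { \
        T s = 0; \
        for (int i = 0; i < n; ++i) s += x[i]; \
        return s; \
    }

DEFINE_SUM(float,  s)  /* corresponds to the PRECISION_CHAR == s block */
DEFINE_SUM(double, d)  /* corresponds to the PRECISION_CHAR == d block */

int main(void)
{
    float  xs[] = { 1.0f, 2.0f, 3.0f };
    double xd[] = { 1.0, 2.0, 3.0 };
    printf("%g %g\n", (double)sum_s(xs, 3), sum_d(xd, 3));
    return 0;
}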
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef DOTXF + +#define DOTXF_SIFIVE_X280_LOAD_ACOL(i) \ + do { \ + acol_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), vl); \ + acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ + acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ + do { \ + acol_vec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (a_tmp + i * lda), 2 * FLT_SIZE * inca, vl); \ + acol_vec_r = VGET_V_F(PREC, LMUL, 2)(acol_vec, 0); \ + acol_vec_i = VGET_V_F(PREC, LMUL, 2)(acol_vec, 1); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF, CONJ_SUF) \ + do { \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc5_r, acc5_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF, CONJ_SUF) \ + do { \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + 
DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc5_r, acc5_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF, CONJ_SUF) \ + do { \ + switch (b) { \ + case 5: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc4_r, acc4_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + case 4: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc3_r, acc3_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + case 3: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc2_r, acc2_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + case 2: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc1_r, acc1_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + case 1: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMUL_VV##CONJ_SUF(PREC, LMUL, acc0_r, acc0_i, acol_vec_r, acol_vec_i, xvec_r, xvec_i, vl); \ + } \ + } while (0) + +#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF, CONJ_SUF) \ + do { \ + switch (b) { \ + case 5: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc4_r, acc4_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + case 4: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc3_r, acc3_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + case 3: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc2_r, acc2_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + case 2: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc1_r, acc1_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + case 1: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + VCMACC_VV##CONJ_SUF##_TU(PREC, LMUL, acc0_r, acc0_i, xvec_r, xvec_i, acol_vec_r, acol_vec_i, vl); \ + } \ + } while (0) + +#define DOTXF_SIFIVE_X280_REDUCE(i) \ + do { \ + RVV_TYPE_F(PREC, m1) dot##i##_r = VFMV_S_F(PREC, m1)(0., 1); \ + RVV_TYPE_F(PREC, m1) dot##i##_i = VFMV_S_F(PREC, m1)(0., 1); \ + dot##i##_r = VF_REDUSUM_VS(PREC, LMUL)(acc##i##_r, dot##i##_r, m); \ + dot##i##_i = VF_REDUSUM_VS(PREC, LMUL)(acc##i##_i, dot##i##_i, m); \ + RVV_TYPE_F(PREC, m1) y##i##_r, y##i##_i; \ + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ + if (bli_is_conj(conjatx)) \ + VCMUL_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + else \ + VCMUL_VF(PREC, m1, y##i##_r, y##i##_i, dot##i##_r, dot##i##_i, alpha->real, alpha->imag, 1); \ + y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ + y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ + } \ + else { \ + PASTEMAC(PRECISION_CHAR, scals)(*beta, y[i * incy]) \ + y##i##_r = VFMV_S_F(PREC, m1)(y[i * incy].real, 1); \ + y##i##_i = VFMV_S_F(PREC, m1)(y[i * incy].imag, 1); \ + if (bli_is_conj(conjatx)) \ + VCMACC_VF_CONJ(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + else \ + VCMACC_VF(PREC, m1, y##i##_r, y##i##_i, alpha->real, alpha->imag, dot##i##_r, dot##i##_i, 1); \ + y[i * incy].real = VFMV_F_S(PREC)(y##i##_r); \ + y[i * incy].imag = VFMV_F_S(PREC)(y##i##_i); \ + } \ + } while (0) + +DOTXF(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjat(A^T) * conjx(x) + + (void) cntx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict beta = beta_; 
+ DATATYPE* restrict y = y_; + + if (b == 0) return; + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + conj_t conjatx = BLIS_NO_CONJUGATE; + if (bli_is_conj(conjx)) { + bli_toggle_conj(&conjat); + bli_toggle_conj(&conjx); + bli_toggle_conj(&conjatx); + } + + while (b >= 6) { + // Compute dot product of x with 6 columns of a. + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) acc0_r, acc0_i, acc1_r, acc1_i, acc2_r, acc2_i, + acc3_r, acc3_i, acc4_r, acc4_i, acc5_r, acc5_i; + RVV_TYPE_FX(PREC, LMUL, 2) xvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) xvec_r, xvec_i, acol_vec_r, acol_vec_i; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, 2 * FLT_SIZE * incx, vl); + xvec_r = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_i = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + + if (first) { + if (bli_is_conj(conjat)) { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , _CONJ); + else + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, _CONJ); + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST( , ); + else + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED, ); + } + first = false; + } + else { + if (bli_is_conj(conjat)) { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY( , _CONJ); + else + DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, _CONJ); + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY( , ); + else + DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED, ); + } + } + + a_tmp += vl * inca; + x_tmp += vl * incx; + avl -= vl; + } + + DOTXF_SIFIVE_X280_REDUCE(0); + DOTXF_SIFIVE_X280_REDUCE(1); + DOTXF_SIFIVE_X280_REDUCE(2); + DOTXF_SIFIVE_X280_REDUCE(3); + DOTXF_SIFIVE_X280_REDUCE(4); + DOTXF_SIFIVE_X280_REDUCE(5); + + a += 6 * lda; + y += 6 * incy; + b -= 6; + } + + if (b > 0) { + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) acc0_r, acc0_i, acc1_r, acc1_i, acc2_r, acc2_i, + acc3_r, acc3_i, acc4_r, acc4_i; + RVV_TYPE_FX(PREC, LMUL, 2) xvec, acol_vec; + RVV_TYPE_F(PREC, LMUL) xvec_r, xvec_i, acol_vec_r, acol_vec_i; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x_tmp, 2 * FLT_SIZE * incx, vl); + xvec_r = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_i = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + + if (first) { + if (bli_is_conj(conjat)) { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , _CONJ); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, _CONJ); + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST( , ); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED, ); + } + first = false; + } + else { + if (bli_is_conj(conjat)) { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY( , _CONJ); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, _CONJ); + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY( , ); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED, ); + } + } + + a_tmp += vl * inca; + x_tmp += vl * incx; + avl -= vl; + } + + switch (b) { + case 5: + DOTXF_SIFIVE_X280_REDUCE(4); + case 4: + DOTXF_SIFIVE_X280_REDUCE(3); + case 3: + 
DOTXF_SIFIVE_X280_REDUCE(2); + case 2: + DOTXF_SIFIVE_X280_REDUCE(1); + case 1: + DOTXF_SIFIVE_X280_REDUCE(0); + } + } + return; +} + +#undef DOTXF_SIFIVE_X280_LOAD_ACOL +#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED +#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST +#undef DOTXF_SIFIVE_X280_LOOP_BODY +#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST +#undef DOTXF_SIFIVE_X280_CLEANUP_BODY +#undef DOTXF_SIFIVE_X280_REDUCE + +#endif // DOTXF diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c new file mode 100644 index 0000000000..8286e2476f --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_intr/bli_dotxf_sifive_x280_intr_real.c @@ -0,0 +1,262 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
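Both the complex body above and the real body below compute y := beta * y + alpha * conjat(A^T) * conjx(x), vectorizing the length-m dot products. In the complex case, when conjx is BLIS_CONJUGATE the kernel toggles conjat, conjx, and conjatx together, using conjat(A)^T * conj(x) = conj(toggle(conjat)(A)^T * x), so the inner loop never conjugates x and the final conjugation is folded into the alpha update. A scalar sketch of the real-domain operation, for reference only (ref_ddotxf is a hypothetical name, not part of this kernel set):

#include <stddef.h>

/* y := beta*y + alpha * A^T * x, double-precision real, general strides.
   Note: the actual kernels special-case beta == 0 so y is never read then. */
static void ref_ddotxf(size_t m, size_t b, double alpha,
                       const double* a, ptrdiff_t inca, ptrdiff_t lda,
                       const double* x, ptrdiff_t incx,
                       double beta, double* y, ptrdiff_t incy)
{
    for (size_t j = 0; j < b; ++j)          /* one dot product per column of A */
    {
        double dot = 0.0;
        for (size_t i = 0; i < m; ++i)
            dot += a[i * inca + j * lda] * x[i * incx];
        y[j * incy] = beta * y[j * incy] + alpha * dot;
    }
}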
+ +*/ + +// clang-format off +#ifdef DOTXF + +#define DOTXF_SIFIVE_X280_LOAD_ACOL(i) \ + do { \ + acol_vec = VLE_V_F(PREC, LMUL)(a_tmp + i * lda, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED(i) \ + do { \ + acol_vec = VLSE_V_F(PREC, LMUL)(a_tmp + i * lda, FLT_SIZE * inca, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(LOAD_SUF) \ + do { \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + acc5 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_LOOP_BODY(LOAD_SUF) \ + do { \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(5); \ + acc5 = VFMACC_VV_TU(PREC, LMUL)(acc5, acol_vec, xvec, vl); \ + } while (0) + +#define DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(LOAD_SUF) \ + do { \ + switch (b) { \ + case 5: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + acc4 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + case 4: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + acc3 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + case 3: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + acc2 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + case 2: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + acc1 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + case 1: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + acc0 = VFMUL_VV(PREC, LMUL)(acol_vec, xvec, vl); \ + } \ + } while (0) + +#define DOTXF_SIFIVE_X280_CLEANUP_BODY(LOAD_SUF) \ + do { \ + switch (b) { \ + case 5: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(4); \ + acc4 = VFMACC_VV_TU(PREC, LMUL)(acc4, acol_vec, xvec, vl); \ + case 4: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(3); \ + acc3 = VFMACC_VV_TU(PREC, LMUL)(acc3, acol_vec, xvec, vl); \ + case 3: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(2); \ + acc2 = VFMACC_VV_TU(PREC, LMUL)(acc2, acol_vec, xvec, vl); \ + case 2: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(1); \ + acc1 = VFMACC_VV_TU(PREC, LMUL)(acc1, acol_vec, xvec, vl); \ + case 1: \ + DOTXF_SIFIVE_X280_LOAD_ACOL##LOAD_SUF(0); \ + acc0 = VFMACC_VV_TU(PREC, LMUL)(acc0, acol_vec, xvec, vl); \ + } \ + } while (0) + +#define DOTXF_SIFIVE_X280_REDUCE(i) \ + do { \ + RVV_TYPE_F(PREC, m1) dot##i = VFMV_S_F(PREC, m1)(0., 1); \ + dot##i = VF_REDUSUM_VS(PREC, LMUL)(acc##i, dot##i, m); \ + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { \ + dot##i = VFMUL_VF(PREC, m1)(dot##i, *alpha, 1); \ + y[i * incy] = VFMV_F_S(PREC)(dot##i); \ + } \ + else { \ + y[i * incy] *= *beta; \ + RVV_TYPE_F(PREC, m1) y##i = VFMV_S_F(PREC, m1)(y[i * incy], 1); \ + y##i = VFMACC_VF(PREC, m1)(y##i, *alpha, 
dot##i, 1); \ + y[i * incy] = VFMV_F_S(PREC)(y##i); \ + } \ + } while (0) + +DOTXF(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjat(A^T) * conjx(x) + + (void) conjat; // Suppress unused parameter warnings + (void) conjx; + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict y = y_; + + if (b == 0) return; + if (m == 0 || PASTEMAC(PRECISION_CHAR, eq0)(*alpha)) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + while (b >= 6) { + // Compute dot product of x with 6 columns of a. + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) acc0, acc1, acc2, acc3, acc4, acc5; + RVV_TYPE_F(PREC, LMUL) xvec, acol_vec; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x_tmp, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl); + if (first) { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(); + else + DOTXF_SIFIVE_X280_LOOP_BODY_FIRST(_STRIDED); + first = false; + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_LOOP_BODY(); + else + DOTXF_SIFIVE_X280_LOOP_BODY(_STRIDED); + } + + a_tmp += vl * inca; + x_tmp += vl * incx; + avl -= vl; + } + + DOTXF_SIFIVE_X280_REDUCE(0); + DOTXF_SIFIVE_X280_REDUCE(1); + DOTXF_SIFIVE_X280_REDUCE(2); + DOTXF_SIFIVE_X280_REDUCE(3); + DOTXF_SIFIVE_X280_REDUCE(4); + DOTXF_SIFIVE_X280_REDUCE(5); + + a += 6 * lda; + y += 6 * incy; + b -= 6; + } + + if (b > 0) { + const DATATYPE* restrict a_tmp = a; + const DATATYPE* restrict x_tmp = x; + RVV_TYPE_F(PREC, LMUL) acc0, acc1, acc2, acc3, acc4; + RVV_TYPE_F(PREC, LMUL) xvec, acol_vec; + bool first = true; + size_t avl = m; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x_tmp, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x_tmp, FLT_SIZE * incx, vl); + if (first) { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST(_STRIDED); + first = false; + } + else { + if (inca == 1) + DOTXF_SIFIVE_X280_CLEANUP_BODY(); + else + DOTXF_SIFIVE_X280_CLEANUP_BODY(_STRIDED); + } + + a_tmp += vl * inca; + x_tmp += vl * incx; + avl -= vl; + } + + switch (b) { + case 5: + DOTXF_SIFIVE_X280_REDUCE(4); + case 4: + DOTXF_SIFIVE_X280_REDUCE(3); + case 3: + DOTXF_SIFIVE_X280_REDUCE(2); + case 2: + DOTXF_SIFIVE_X280_REDUCE(1); + case 1: + DOTXF_SIFIVE_X280_REDUCE(0); + } + + } + return; +} + +#undef DOTXF_SIFIVE_X280_LOAD_ACOL +#undef DOTXF_SIFIVE_X280_LOAD_ACOL_STRIDED +#undef DOTXF_SIFIVE_X280_LOOP_BODY_FIRST +#undef DOTXF_SIFIVE_X280_LOOP_BODY +#undef DOTXF_SIFIVE_X280_CLEANUP_BODY_FIRST +#undef DOTXF_SIFIVE_X280_CLEANUP_BODY +#undef DOTXF_SIFIVE_X280_REDUCE + +#endif // DOTXF diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c deleted file mode 100644 index 3ee4cdd20c..0000000000 --- a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm.c +++ /dev/null @@ -1,1465 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. 
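Both dotxf variants keep the control structure of the assembly kernels they replace: the main loop handles six columns of A at a time, strip-mining the length-m dot products with vsetvl inside while (avl), and a trailing switch handles the 0 < b < 6 remainder with intentional fallthrough so that exactly b accumulators are reduced. A compact sketch of that cleanup pattern, assuming a hypothetical reduce_one() in place of DOTXF_SIFIVE_X280_REDUCE(i):

#include <stdio.h>

static void reduce_one(int i) { printf("reduce column %d\n", i); }

/* mirrors the cleanup switch: for 0 < b < 6, columns b-1 .. 0 are reduced */
static void cleanup(int b)
{
    switch (b) {
        case 5: reduce_one(4); /* fall through */
        case 4: reduce_one(3); /* fall through */
        case 3: reduce_one(2); /* fall through */
        case 2: reduce_one(1); /* fall through */
        case 1: reduce_one(0);
    }
}

int main(void)
{
    cleanup(3);   /* prints: reduce column 2, then 1, then 0 */
    return 0;
}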
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "../riscv_cmul_macros_asm.h" -#include "../bli_kernels_sifive_x280.h" -#include -#include -#include -#include - -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " -#define VSSSEG8 "vssseg8e32.v " -#define VSSSEG7 "vssseg7e32.v " -#define VSSSEG6 "vssseg6e32.v " -#define VSSSEG5 "vssseg5e32.v " -#define VSSSEG4 "vssseg4e32.v " -#define VSSSEG3 "vssseg3e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define NR 64 - -void bli_spackm_sifive_x280_asm_7m4 - ( - conj_t conja, - pack_t schema, - dim_t cdim, - dim_t cdim_max, - dim_t cdim_bcast, - dim_t n, - dim_t n_max, - const void* restrict kappa_, - const void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp, - const void* restrict params, - const cntx_t* cntx - ) -{ - (void) conja; - (void) cntx; - const float* kappa = kappa_; - const float* a = a_; - float* p = p_; - - float kappa_cast = *kappa; - - // MRxk kernel - if (cdim <= 7 && cdim_max == 7 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 0: __asm__("vmv.v.i v0, 0"); - case 1: __asm__("vmv.v.i v1, 0"); - case 2: __asm__("vmv.v.i v2, 0"); - case 3: __asm__("vmv.v.i v3, 0"); - case 4: __asm__("vmv.v.i v4, 0"); - case 5: __asm__("vmv.v.i v5, 0"); - case 6: __asm__("vmv.v.i v6, 0"); - } - a += (cdim - 1) * inca; - size_t avl = n; - while (avl) { - const float* a_tmp = a; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 7: - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 6: - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp -= 
inca; - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast != 1.f) { - switch (cdim) { - case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - } - __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= FLT_SIZE; - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast != 1.f) { - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSE "v0, (%0)" : : "r"(p)); - a += lda; - p += ldp; - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // NRxk kernel - else if (cdim <= 64 && cdim_max == 64 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v8, 0"); - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - dim_t cdim_tmp = cdim; - const float* a_tmp = a; - float* p_tmp = p; - while (cdim_tmp >= 8) { - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v7, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - if (kappa_cast != 1.f) { - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast)); - } - __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - p_tmp += 8; - cdim_tmp -= 8; - } - if (cdim_tmp > 0) { - a_tmp += (cdim_tmp - 1) * inca; - switch (cdim_tmp) { - case 7: - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 6: - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - 
case 3: - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast != 1.f) { - switch (cdim_tmp) { - case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - } - switch (cdim_tmp) { - case 7: - __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 6: - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 5: - __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 4: - __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 3: - __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - } - p_tmp += cdim_tmp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); - for (size_t i = 0; i < vl; ++i) { - __asm__(VSE "v8, (%0)" : : "r"(p_tmp)); - p_tmp += ldp; - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v8, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= FLT_SIZE; - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast != 1.f) { - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSE "v0, (%0)" : : "r"(p)); - a += lda; - p += ldp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // generic kernel - else - { - bli_sspackm_sifive_x280_ref - ( - conja, - schema, - cdim, - cdim_max, - cdim_bcast, - n, - n_max, - kappa, - a, inca, lda, - p, ldp, - params, - cntx - ); - } -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef VSSSEG8 -#undef VSSSEG7 -#undef VSSSEG6 -#undef VSSSEG5 -#undef VSSSEG4 -#undef VSSSEG3 -#undef VSSSEG2 -#undef NR - -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " -#define VSSSEG8 "vssseg8e64.v " -#define VSSSEG7 "vssseg7e64.v " -#define VSSSEG6 "vssseg6e64.v " -#define VSSSEG5 "vssseg5e64.v " -#define VSSSEG4 "vssseg4e64.v " -#define VSSSEG3 "vssseg3e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define NR 32 - -void bli_dpackm_sifive_x280_asm_7m4 - ( - conj_t conja, - pack_t schema, - dim_t cdim, - dim_t cdim_max, - 
dim_t cdim_bcast, - dim_t n, - dim_t n_max, - const void* restrict kappa_, - const void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp, - const void* restrict params, - const cntx_t* cntx - ) -{ - (void) conja; - (void) cntx; - const double* kappa = kappa_; - const double* a = a_; - double* p = p_; - - double kappa_cast = *kappa; - - // MRxk kernel - if (cdim <= 7 && cdim_max == 7 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 0: __asm__("vmv.v.i v0, 0"); - case 1: __asm__("vmv.v.i v1, 0"); - case 2: __asm__("vmv.v.i v2, 0"); - case 3: __asm__("vmv.v.i v3, 0"); - case 4: __asm__("vmv.v.i v4, 0"); - case 5: __asm__("vmv.v.i v5, 0"); - case 6: __asm__("vmv.v.i v6, 0"); - } - a += (cdim - 1) * inca; - size_t avl = n; - while (avl) { - const double* a_tmp = a; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 7: - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 6: - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast != 1.) { - switch (cdim) { - case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - } - __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= FLT_SIZE; - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast != 1.) 
{ - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSE "v0, (%0)" : : "r"(p)); - a += lda; - p += ldp; - } - __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // NRxk kernel - else if (cdim <= 32 && cdim_max == 32 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v8, 0"); - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - dim_t cdim_tmp = cdim; - const double* a_tmp = a; - double* p_tmp = p; - while (cdim_tmp >= 8) { - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLE "v7, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - if (kappa_cast != 1.) { - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast)); - } - __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - p_tmp += 8; - cdim_tmp -= 8; - } - if (cdim_tmp > 0) { - a_tmp += (cdim_tmp - 1) * inca; - switch (cdim_tmp) { - case 7: - __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 6: - __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast != 1.) 
{ - switch (cdim_tmp) { - case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); - case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); - case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); - case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); - case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); - case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); - case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - } - switch (cdim_tmp) { - case 7: - __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 6: - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 5: - __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 4: - __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 3: - __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); - break; - } - p_tmp += cdim_tmp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); - for (size_t i = 0; i < vl; ++i) { - __asm__(VSE "v8, (%0)" : : "r"(p_tmp)); - p_tmp += ldp; - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v8, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= FLT_SIZE; - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == FLT_SIZE) { - __asm__(VLE "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast != 1.) 
{ - __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSE "v0, (%0)" : : "r"(p)); - a += lda; - p += ldp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSE "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // generic kernel - else - { - bli_ddpackm_sifive_x280_ref - ( - conja, - schema, - cdim, - cdim_max, - cdim_bcast, - n, - n_max, - kappa, - a, inca, lda, - p, ldp, - params, - cntx - ); - } -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef VSSSEG8 -#undef VSSSEG7 -#undef VSSSEG6 -#undef VSSSEG5 -#undef VSSSEG4 -#undef VSSSEG3 -#undef VSSSEG2 -#undef NR - -#define FLT_SIZE 4 -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define VSSSEG4 "vssseg4e32.v " -#define VSSSEG6 "vssseg6e32.v " -#define VSSSEG8 "vssseg8e32.v " -#define NR 32 - -void bli_cpackm_sifive_x280_asm_6m2 - ( - conj_t conja, - pack_t schema, - dim_t cdim, - dim_t cdim_max, - dim_t cdim_bcast, - dim_t n, - dim_t n_max, - const void* restrict kappa_, - const void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp, - const void* restrict params, - const cntx_t* cntx - ) -{ - (void) cntx; - const scomplex* kappa = kappa_; - const scomplex* a = a_; - scomplex* p = p_; - - scomplex kappa_cast = *kappa; - - // MRxk kernel - if (cdim <= 6 && cdim_max == 6 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - switch (cdim) { - case 0: - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - case 1: - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v3, 0"); - case 2: - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v5, 0"); - case 3: - __asm__("vmv.v.i v6, 0"); - __asm__("vmv.v.i v7, 0"); - case 4: - __asm__("vmv.v.i v8, 0"); - __asm__("vmv.v.i v9, 0"); - case 5: - __asm__("vmv.v.i v10, 0"); - __asm__("vmv.v.i v11, 0"); - } - } - else { - switch (cdim) { - case 0: - __asm__("vmv.v.i v12, 0"); - __asm__("vmv.v.i v13, 0"); - case 1: - __asm__("vmv.v.i v14, 0"); - __asm__("vmv.v.i v15, 0"); - case 2: - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v17, 0"); - case 3: - __asm__("vmv.v.i v18, 0"); - __asm__("vmv.v.i v19, 0"); - case 4: - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v21, 0"); - case 5: - __asm__("vmv.v.i v22, 0"); - __asm__("vmv.v.i v23, 0"); - } - } - a += (cdim - 1) * inca; - size_t avl = n; - while (avl) { - const scomplex* a_tmp = a; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 6: - __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - switch (cdim) { - case 6: __asm__("vfneg.v v11, v11"); - case 5: __asm__("vfneg.v v9, v9"); - case 4: __asm__("vfneg.v v7, v7"); - case 3: __asm__("vfneg.v v5, 
v5"); - case 2: __asm__("vfneg.v v3, v3"); - case 1: __asm__("vfneg.v v1, v1"); - } - } - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); - __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - switch (cdim) { - case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); - case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); - case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); - case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - else { - switch (cdim) { - case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); - case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); - case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); - case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); - __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= 2 * FLT_SIZE; - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v3, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v1, v1"); - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v2, (%0)" : : "r"(p)); - } - a += lda; - p += ldp; - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // NRxk kernel - else if (cdim <= 32 && cdim_max == 32 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v18, 0"); - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - dim_t cdim_tmp = cdim; - const scomplex* a_tmp = a; - scomplex* p_tmp = p; - while (cdim_tmp >= 4) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - 
a_tmp += inca; - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v1, v1"); - __asm__("vfneg.v v3, v3"); - __asm__("vfneg.v v5, v5"); - __asm__("vfneg.v v7, v7"); - } - __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); - } - __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - } - p_tmp += 4; - cdim_tmp -= 4; - } - if (cdim_tmp > 0) { - a_tmp += (cdim_tmp - 1) * inca; - switch (cdim_tmp) { - case 3: - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - switch (cdim_tmp) { - case 3: __asm__("vfneg.v v5, v5"); - case 2: __asm__("vfneg.v v3, v3"); - case 1: __asm__("vfneg.v v1, v1"); - } - } - switch (cdim_tmp) { - case 3: - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - } - } - else { - if (conja == BLIS_NO_CONJUGATE) { - switch (cdim_tmp) { - case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - else { - switch (cdim_tmp) { - case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - switch (cdim_tmp) { - case 3: - __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - } - } - p_tmp += cdim_tmp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); - for (size_t i = 0; i < vl; ++i) { - __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp)); - p_tmp += ldp; - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v16, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= 2 * FLT_SIZE; - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i 
v4, 0"); - __asm__("vmv.v.i v6, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v2, v2"); - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v4, (%0)" : : "r"(p)); - } - a += lda; - p += ldp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // generic kernel - else - { - bli_ccpackm_sifive_x280_ref - ( - conja, - schema, - cdim, - cdim_max, - cdim_bcast, - n, - n_max, - kappa, - a, inca, lda, - p, ldp, - params, - cntx - ); - } -} - -#undef FLT_SIZE -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef VSSSEG4 -#undef VSSSEG6 -#undef VSSSEG8 -#undef NR - -#define FLT_SIZE 8 -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define VSSSEG4 "vssseg4e64.v " -#define VSSSEG6 "vssseg6e64.v " -#define VSSSEG8 "vssseg8e64.v " -#define NR 16 - -void bli_zpackm_sifive_x280_asm_6m2 - ( - conj_t conja, - pack_t schema, - dim_t cdim, - dim_t cdim_max, - dim_t cdim_bcast, - dim_t n, - dim_t n_max, - const void* restrict kappa_, - const void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp, - const void* restrict params, - const cntx_t* cntx - ) -{ - (void) cntx; - const dcomplex* kappa = kappa_; - const dcomplex* a = a_; - dcomplex* p = p_; - - dcomplex kappa_cast = *kappa; - - // MRxk kernel - if (cdim <= 6 && cdim_max == 6 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) 
{ - switch (cdim) { - case 0: - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - case 1: - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v3, 0"); - case 2: - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v5, 0"); - case 3: - __asm__("vmv.v.i v6, 0"); - __asm__("vmv.v.i v7, 0"); - case 4: - __asm__("vmv.v.i v8, 0"); - __asm__("vmv.v.i v9, 0"); - case 5: - __asm__("vmv.v.i v10, 0"); - __asm__("vmv.v.i v11, 0"); - } - } - else { - switch (cdim) { - case 0: - __asm__("vmv.v.i v12, 0"); - __asm__("vmv.v.i v13, 0"); - case 1: - __asm__("vmv.v.i v14, 0"); - __asm__("vmv.v.i v15, 0"); - case 2: - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v17, 0"); - case 3: - __asm__("vmv.v.i v18, 0"); - __asm__("vmv.v.i v19, 0"); - case 4: - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v21, 0"); - case 5: - __asm__("vmv.v.i v22, 0"); - __asm__("vmv.v.i v23, 0"); - } - } - a += (cdim - 1) * inca; - size_t avl = n; - while (avl) { - const dcomplex* a_tmp = a; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - switch (cdim) { - case 6: - __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 5: - __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 4: - __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 3: - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { - if (conja == BLIS_CONJUGATE) { - switch (cdim) { - case 6: __asm__("vfneg.v v11, v11"); - case 5: __asm__("vfneg.v v9, v9"); - case 4: __asm__("vfneg.v v7, v7"); - case 3: __asm__("vfneg.v v5, v5"); - case 2: __asm__("vfneg.v v3, v3"); - case 1: __asm__("vfneg.v v1, v1"); - } - } - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); - __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - switch (cdim) { - case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); - case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); - case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); - case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - else { - switch (cdim) { - case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); - case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); - case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); - case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); - __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= 2 * FLT_SIZE; - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); 
- __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v3, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v1, v1"); - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v2, (%0)" : : "r"(p)); - } - a += lda; - p += ldp; - } - __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v1, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // NRxk kernel - else if (cdim <= 16 && cdim_max == 16 && cdim_bcast == 1) - { - if (lda == 1) { - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v18, 0"); - size_t avl = n; - while (avl) { - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); - dim_t cdim_tmp = cdim; - const dcomplex* a_tmp = a; - dcomplex* p_tmp = p; - while (cdim_tmp >= 4) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); - a_tmp += inca; - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v1, v1"); - __asm__("vfneg.v v3, v3"); - __asm__("vfneg.v v5, v5"); - __asm__("vfneg.v v7, v7"); - } - __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); - } - __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - } - p_tmp += 4; - cdim_tmp -= 4; - } - if (cdim_tmp > 0) { - a_tmp += (cdim_tmp - 1) * inca; - switch (cdim_tmp) { - case 3: - __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 2: - __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); - a_tmp -= inca; - case 1: - __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); - } - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) 
{ - if (conja == BLIS_CONJUGATE) { - switch (cdim_tmp) { - case 3: __asm__("vfneg.v v5, v5"); - case 2: __asm__("vfneg.v v3, v3"); - case 1: __asm__("vfneg.v v1, v1"); - } - } - switch (cdim_tmp) { - case 3: - __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - } - } - else { - if (conja == BLIS_NO_CONJUGATE) { - switch (cdim_tmp) { - case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - else { - switch (cdim_tmp) { - case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); - case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); - case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); - } - } - switch (cdim_tmp) { - case 3: - __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 2: - __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - case 1: - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); - break; - } - } - p_tmp += cdim_tmp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); - for (size_t i = 0; i < vl; ++i) { - __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp)); - p_tmp += ldp; - } - a += vl; - p += vl * ldp; - avl -= vl; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v16, (%0)" : : "r"(p)); - p += ldp; - } - } - else { - inca *= 2 * FLT_SIZE; - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v6, 0"); - for (size_t i = 0; i < n; ++i) { - __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); - if (inca == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); - } - if (kappa_cast.real == 1. && kappa_cast.imag == 0.) 
{ - if (conja == BLIS_CONJUGATE) { - __asm__("vfneg.v v2, v2"); - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - } - else { - if (conja == BLIS_NO_CONJUGATE) { - vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); - } - else { - vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__(VSSEG2 "v4, (%0)" : : "r"(p)); - } - a += lda; - p += ldp; - } - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - for (size_t i = n; i < n_max; ++i) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); - p += ldp; - } - } - } - // generic kernel - else - { - bli_zzpackm_sifive_x280_ref - ( - conja, - schema, - cdim, - cdim_max, - cdim_bcast, - n, - n_max, - kappa, - a, inca, lda, - p, ldp, - params, - cntx - ); - } -} diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c new file mode 100644 index 0000000000..119872197a --- /dev/null +++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr.c @@ -0,0 +1,168 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define PACKM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##packm_sifive_x280_intr(\ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ + dim_t cdim_max, \ + dim_t cdim_bcast, \ + dim_t n, \ + dim_t n_max, \ + const T* restrict kappa_, \ + const T* restrict a_, inc_t inca, inc_t lda, \ + T* restrict p_, inc_t ldp, \ + const T* restrict params, \ + const cntx_t* cntx \ +) + +#define PACKM(...) 
PACKM_(__VA_ARGS__) + +#define REF_KERNEL_(PRECISION_CHAR) bli_##PRECISION_CHAR##PRECISION_CHAR##packm_sifive_x280_ref +#define REF_KERNEL(PRECISION_CHAR) REF_KERNEL_(PRECISION_CHAR) + +// LMUL is the LMUL used when a is "row major" (lda == 1). Since we use +// segment stores with more than 4 fields, this is usually m1. +// LMUL_MR is an LMUL large enough to hold MR floats (for spackm, cpackm) +// or doubles (for dpackm, zpackm). LMUL_NR is analogous. + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m1 +#define LMUL_MR m1 +#define LMUL_NR m4 +#define FLT_SIZE sizeof(float) +#define MR 7 +#define NR 64 + +#include "./bli_packm_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef LMUL_MR +#undef LMUL_NR +#undef FLT_SIZE +#undef MR +#undef NR + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m1 +#define LMUL_MR m1 +#define LMUL_NR m4 +#define FLT_SIZE sizeof(double) +#define MR 7 +#define NR 32 + +#include "./bli_packm_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef LMUL_MR +#undef LMUL_NR +#undef FLT_SIZE +#undef MR +#undef NR + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m1 +#define LMUL_MR m1 +#define LMUL_NR m2 +#define FLT_SIZE sizeof(float) +#define MR 6 +#define NR 32 + +#include "./bli_packm_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef LMUL_MR +#undef LMUL_NR +#undef FLT_SIZE +#undef MR +#undef NR + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m1 +#define LMUL_MR m1 +#define LMUL_NR m2 +#define FLT_SIZE sizeof(double) +#define MR 6 +#define NR 16 + +#include "./bli_packm_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef LMUL_MR +#undef LMUL_NR +#undef FLT_SIZE +#undef MR +#undef NR + +#undef REF_KERNEL_ +#undef REF_KERNEL + +#undef PACKM +#undef PACKM_ diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..ee49090dc9 --- /dev/null +++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_complex.c @@ -0,0 +1,545 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef PACKM + +PACKM(PRECISION_CHAR, void) +{ + (void) schema; // Suppress unused parameter warnings + (void) params; + (void) cntx; + const DATATYPE* restrict kappa = kappa_; + const DATATYPE* restrict a = a_; + DATATYPE* restrict p = p_; + + // MRxk kernel + if (cdim <= MR && cdim_max == MR && cdim_bcast == 1) + { + if (lda == 1) { + // a is "row major" + RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, arow2_r, arow3_r, arow4_r, arow5_r; + RVV_TYPE_F(PREC, LMUL) arow0_i, arow1_i, arow2_i, arow3_i, arow4_i, arow5_i; + RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r, + kappa_arow3_r, kappa_arow4_r, kappa_arow5_r; + RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i, + kappa_arow3_i, kappa_arow4_i, kappa_arow5_i; + // pad lower edge + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + switch (cdim) { + case 0: + arow0_r = VFMV_V_F(PREC, LMUL)(0., n); + arow0_i = VFMV_V_F(PREC, LMUL)(0., n); + case 1: + arow1_r = VFMV_V_F(PREC, LMUL)(0., n); + arow1_i = VFMV_V_F(PREC, LMUL)(0., n); + case 2: + arow2_r = VFMV_V_F(PREC, LMUL)(0., n); + arow2_i = VFMV_V_F(PREC, LMUL)(0., n); + case 3: + arow3_r = VFMV_V_F(PREC, LMUL)(0., n); + arow3_i = VFMV_V_F(PREC, LMUL)(0., n); + case 4: + arow4_r = VFMV_V_F(PREC, LMUL)(0., n); + arow4_i = VFMV_V_F(PREC, LMUL)(0., n); + case 5: + arow5_r = VFMV_V_F(PREC, LMUL)(0., n); + arow5_i = VFMV_V_F(PREC, LMUL)(0., n); + } + } else { + switch (cdim) { + case 0: + kappa_arow0_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow0_i = VFMV_V_F(PREC, LMUL)(0., n); + case 1: + kappa_arow1_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow1_i = VFMV_V_F(PREC, LMUL)(0., n); + case 2: + kappa_arow2_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow2_i = VFMV_V_F(PREC, LMUL)(0., n); + case 3: + kappa_arow3_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow3_i = VFMV_V_F(PREC, LMUL)(0., n); + case 4: + kappa_arow4_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow4_i = VFMV_V_F(PREC, LMUL)(0., n); + case 5: + kappa_arow5_r = VFMV_V_F(PREC, LMUL)(0., n); + kappa_arow5_i = VFMV_V_F(PREC, LMUL)(0., n); + } + } + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) arow_vec; + switch (cdim) { + case 6: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 5 * inca), vl); + arow5_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow5_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 5: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 4 * inca), vl); + arow4_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow4_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 4: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 3 * inca), vl); + arow3_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow3_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 3: + 
arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 2 * inca), vl); + arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 2: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 1 * inca), vl); + arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 1: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a + 0 * inca), vl); + arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + } + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + switch (cdim) { + case 6: + arow5_i = VFNEG_VF(PREC, LMUL)(arow5_i, vl); + case 5: + arow4_i = VFNEG_VF(PREC, LMUL)(arow4_i, vl); + case 4: + arow3_i = VFNEG_VF(PREC, LMUL)(arow3_i, vl); + case 3: + arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl); + case 2: + arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl); + case 1: + arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 6) ablock = VUNDEFINED_FX(PREC, LMUL, 6)(); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, arow0_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, arow0_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, arow1_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, arow1_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, arow2_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, arow2_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p, 2 * FLT_SIZE * ldp, ablock, vl); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, arow3_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, arow3_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, arow4_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, arow4_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, arow5_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, arow5_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*)(p + 3), 2 * FLT_SIZE * ldp, ablock, vl); + } else { + if (bli_is_conj(conja)) { + switch (cdim) { + case 6: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow5_r, kappa_arow5_i, arow5_r, arow5_i, kappa->real, kappa->imag, vl); + case 5: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow4_r, kappa_arow4_i, arow4_r, arow4_i, kappa->real, kappa->imag, vl); + case 4: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl); + case 3: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + case 2: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + case 1: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + } + } else { + switch (cdim) { + case 6: + VCMUL_VF(PREC, LMUL, kappa_arow5_r, kappa_arow5_i, arow5_r, arow5_i, kappa->real, kappa->imag, vl); + case 5: + VCMUL_VF(PREC, LMUL, kappa_arow4_r, kappa_arow4_i, arow4_r, arow4_i, kappa->real, kappa->imag, vl); + case 4: + VCMUL_VF(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl); + case 3: + VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + case 2: + VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + case 1: + VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 6) ablock = VUNDEFINED_FX(PREC, LMUL, 6)(); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, kappa_arow0_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, kappa_arow0_i); + ablock 
= VSET_V_F(PREC, LMUL, 6)(ablock, 2, kappa_arow1_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, kappa_arow1_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, kappa_arow2_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, kappa_arow2_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p, 2 * FLT_SIZE * ldp, ablock, vl); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 0, kappa_arow3_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 1, kappa_arow3_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 2, kappa_arow4_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 3, kappa_arow4_i); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 4, kappa_arow5_r); + ablock = VSET_V_F(PREC, LMUL, 6)(ablock, 5, kappa_arow5_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*)(p + 3), 2 * FLT_SIZE * ldp, ablock, vl); + } + + a += vl; + p += vl * ldp; + avl -= vl; + } + + RVV_TYPE_FX(PREC, LMUL_MR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_MR, 2)(); + zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max)); + zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max)); + for (size_t i = n; i < n_max; ++i) { + VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, zero_padding, cdim_max); + p += ldp; + } + } + else { + RVV_TYPE_FX(PREC, LMUL_MR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_MR, 2)(); + zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max)); + zero_padding = VSET_V_F(PREC, LMUL_MR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_MR)(0., cdim_max)); + + for (size_t i = 0; i < n; ++i) { + RVV_TYPE_FX(PREC, LMUL_MR, 2) acol; + if (inca == 1) + acol = VLSEG2_V_F_TU(PREC, LMUL_MR, 2)(zero_padding, (BASE_DT*) a, cdim); + else + acol = VLSSEG2_V_F_TU(PREC, LMUL_MR, 2)(zero_padding, (BASE_DT*) a, 2 * FLT_SIZE * inca, cdim); + RVV_TYPE_F(PREC, LMUL_MR) acol_r = VGET_V_F(PREC, LMUL_MR, 2)(acol, 0); + RVV_TYPE_F(PREC, LMUL_MR) acol_i = VGET_V_F(PREC, LMUL_MR, 2)(acol, 1); + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + acol_i = VFNEG_VF_TU(PREC, LMUL_MR)(acol_i, acol_i, cdim); + acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 0, acol_r); + acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 1, acol_i); + } + } else { + RVV_TYPE_F(PREC, LMUL_MR) kappa_acol_r, kappa_acol_i; + if (bli_is_conj(conja)) + VCMUL_VF_CONJ_TU(PREC, LMUL_MR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim); + else + VCMUL_VF_TU(PREC, LMUL_MR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim); + acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 0, kappa_acol_r); + acol = VSET_V_F(PREC, LMUL_MR, 2)(acol, 1, kappa_acol_i); + } + + VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, acol, cdim_max); + + a += lda; + p += ldp; + } + + for (size_t i = n; i < n_max; ++i) { + VSSEG2_V_F(PREC, LMUL_MR, 2)((BASE_DT*) p, zero_padding, cdim_max); + p += ldp; + } + } + } + // NRxk kernel + else if (cdim <= NR && cdim_max == NR && cdim_bcast == 1) + { + if (lda == 1) { + // a is "row major" + RVV_TYPE_FX(PREC, LMUL_NR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_NR, 2)(); + zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max)); + zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max)); + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + dim_t cdim_tmp = cdim; + const DATATYPE* restrict a_tmp = a; + DATATYPE* restrict p_tmp = p; + while (cdim_tmp >= 4) { + RVV_TYPE_FX(PREC, LMUL, 2) arow_vec; + RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, 
arow2_r, arow3_r; + RVV_TYPE_F(PREC, LMUL) arow0_i, arow1_i, arow2_i, arow3_i; + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 0 * inca), vl); + arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 1 * inca), vl); + arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 2 * inca), vl); + arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 3 * inca), vl); + arow3_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow3_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl); + arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl); + arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl); + arow3_i = VFNEG_VF(PREC, LMUL)(arow3_i, vl); + } + + RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)(); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, arow0_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, arow0_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, arow1_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, arow1_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, arow2_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, arow2_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, arow3_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, arow3_i); + VSSSEG8_V_F(PREC, LMUL, 8)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock, vl); + } else { + RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r, kappa_arow3_r; + RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i, kappa_arow3_i; + if (bli_is_conj(conja)) { + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl); + } else { + VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + VCMUL_VF(PREC, LMUL, kappa_arow3_r, kappa_arow3_i, arow3_r, arow3_i, kappa->real, kappa->imag, vl); + } + + RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)(); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, kappa_arow0_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, kappa_arow0_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, kappa_arow1_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, kappa_arow1_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, kappa_arow2_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, kappa_arow2_i); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, kappa_arow3_r); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, kappa_arow3_i); + VSSSEG8_V_F(PREC, LMUL, 8)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock, vl); + } + + a_tmp += 4 * inca; + p_tmp += 4; + cdim_tmp -= 4; + } + + if (cdim_tmp > 0) { + RVV_TYPE_FX(PREC, LMUL, 2) arow_vec; + RVV_TYPE_F(PREC, LMUL) arow0_r, arow1_r, arow2_r; + RVV_TYPE_F(PREC, LMUL) 
arow0_i, arow1_i, arow2_i; + switch (cdim_tmp) { + case 3: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 2 * inca), vl); + arow2_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow2_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 2: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 1 * inca), vl); + arow1_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow1_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + case 1: + arow_vec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(a_tmp + 0 * inca), vl); + arow0_r = VGET_V_F(PREC, LMUL, 2)(arow_vec, 0); + arow0_i = VGET_V_F(PREC, LMUL, 2)(arow_vec, 1); + } + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + switch (cdim_tmp) { + case 3: + arow2_i = VFNEG_VF(PREC, LMUL)(arow2_i, vl); + case 2: + arow1_i = VFNEG_VF(PREC, LMUL)(arow1_i, vl); + case 1: + arow0_i = VFNEG_VF(PREC, LMUL)(arow0_i, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 6) ablock3 = VUNDEFINED_FX(PREC, LMUL, 6)(); + RVV_TYPE_FX(PREC, LMUL, 4) ablock2 = VUNDEFINED_FX(PREC, LMUL, 4)(); + RVV_TYPE_FX(PREC, LMUL, 2) ablock1 = VUNDEFINED_FX(PREC, LMUL, 2)(); + switch (cdim_tmp) { + case 3: + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 0, arow0_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 1, arow0_i); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 2, arow1_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 3, arow1_i); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 4, arow2_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 5, arow2_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock3, vl); + break; + case 2: + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 0, arow0_r); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 1, arow0_i); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 2, arow1_r); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 3, arow1_i); + VSSSEG4_V_F(PREC, LMUL, 4)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock2, vl); + break; + case 1: + ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 0, arow0_r); + ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 1, arow0_i); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock1, vl); + break; + } + } else { + RVV_TYPE_F(PREC, LMUL) kappa_arow0_r, kappa_arow1_r, kappa_arow2_r; + RVV_TYPE_F(PREC, LMUL) kappa_arow0_i, kappa_arow1_i, kappa_arow2_i; + if (bli_is_conj(conja)) { + switch (cdim_tmp) { + case 3: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + case 2: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + case 1: + VCMUL_VF_CONJ(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + } + } else { + switch (cdim_tmp) { + case 3: + VCMUL_VF(PREC, LMUL, kappa_arow2_r, kappa_arow2_i, arow2_r, arow2_i, kappa->real, kappa->imag, vl); + case 2: + VCMUL_VF(PREC, LMUL, kappa_arow1_r, kappa_arow1_i, arow1_r, arow1_i, kappa->real, kappa->imag, vl); + case 1: + VCMUL_VF(PREC, LMUL, kappa_arow0_r, kappa_arow0_i, arow0_r, arow0_i, kappa->real, kappa->imag, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 6) ablock3 = VUNDEFINED_FX(PREC, LMUL, 6)(); + RVV_TYPE_FX(PREC, LMUL, 4) ablock2 = VUNDEFINED_FX(PREC, LMUL, 4)(); + RVV_TYPE_FX(PREC, LMUL, 2) ablock1 = VUNDEFINED_FX(PREC, LMUL, 2)(); + switch (cdim_tmp) { + case 3: + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 0, kappa_arow0_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 1, kappa_arow0_i); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 2, kappa_arow1_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 3, 
kappa_arow1_i); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 4, kappa_arow2_r); + ablock3 = VSET_V_F(PREC, LMUL, 6)(ablock3, 5, kappa_arow2_i); + VSSSEG6_V_F(PREC, LMUL, 6)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock3, vl); + break; + case 2: + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 0, kappa_arow0_r); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 1, kappa_arow0_i); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 2, kappa_arow1_r); + ablock2 = VSET_V_F(PREC, LMUL, 4)(ablock2, 3, kappa_arow1_i); + VSSSEG4_V_F(PREC, LMUL, 4)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock2, vl); + break; + case 1: + ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 0, kappa_arow0_r); + ablock1 = VSET_V_F(PREC, LMUL, 2)(ablock1, 1, kappa_arow0_i); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) p_tmp, 2 * FLT_SIZE * ldp, ablock1, vl); + break; + } + } + + p_tmp += cdim_tmp; + } + + // pad lower edge + for (size_t i = 0; i < vl; ++i) { + VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p_tmp, zero_padding, cdim_max - cdim); + p_tmp += ldp; + } + + a += vl; + p += vl * ldp; + avl -= vl; + } + + // pad right edge + for (size_t i = n; i < n_max; ++i) { + VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, zero_padding, cdim_max); + p += ldp; + } + } else { + RVV_TYPE_FX(PREC, LMUL_NR, 2) zero_padding = VUNDEFINED_FX(PREC, LMUL_NR, 2)(); + zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 0, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max)); + zero_padding = VSET_V_F(PREC, LMUL_NR, 2)(zero_padding, 1, VFMV_V_F(PREC, LMUL_NR)(0., cdim_max)); + + for (size_t i = 0; i < n; ++i) { + RVV_TYPE_FX(PREC, LMUL_NR, 2) acol; + if (inca == 1) + acol = VLSEG2_V_F_TU(PREC, LMUL_NR, 2)(zero_padding, (BASE_DT*) a, cdim); + else + acol = VLSSEG2_V_F_TU(PREC, LMUL_NR, 2)(zero_padding, (BASE_DT*) a, 2 * FLT_SIZE * inca, cdim); + RVV_TYPE_F(PREC, LMUL_NR) acol_r = VGET_V_F(PREC, LMUL_NR, 2)(acol, 0); + RVV_TYPE_F(PREC, LMUL_NR) acol_i = VGET_V_F(PREC, LMUL_NR, 2)(acol, 1); + + if (PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + if (bli_is_conj(conja)) { + acol_i = VFNEG_VF_TU(PREC, LMUL_NR)(acol_i, acol_i, cdim); + acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 0, acol_r); + acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 1, acol_i); + } + } else { + RVV_TYPE_F(PREC, LMUL_NR) kappa_acol_r, kappa_acol_i; + if (bli_is_conj(conja)) + VCMUL_VF_CONJ_TU(PREC, LMUL_NR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim); + else + VCMUL_VF_TU(PREC, LMUL_NR, kappa_acol_r, kappa_acol_i, acol_r, acol_i, kappa->real, kappa->imag, cdim); + acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 0, kappa_acol_r); + acol = VSET_V_F(PREC, LMUL_NR, 2)(acol, 1, kappa_acol_i); + } + + VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, acol, cdim_max); + + a += lda; + p += ldp; + } + + for (size_t i = n; i < n_max; ++i) { + VSSEG2_V_F(PREC, LMUL_NR, 2)((BASE_DT*) p, zero_padding, cdim_max); + p += ldp; + } + } + } + // generic kernel + else + { + REF_KERNEL(PRECISION_CHAR) + ( + conja, + schema, + cdim, + cdim_max, + cdim_bcast, + n, + n_max, + kappa, + a, inca, lda, + p, ldp, + params, + cntx + ); + } + + return; +} + +#endif // PACKM diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c new file mode 100644 index 0000000000..741714d60a --- /dev/null +++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_intr/bli_packm_sifive_x280_intr_real.c @@ -0,0 +1,364 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
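For reference, the packing operation that the complex kernel above and the real kernel that follows both implement can be stated as one scalar loop nest: packed column j holds the first cdim entries of column j of A scaled by kappa (conjugated when conja is set), with rows cdim..cdim_max-1 and columns n..n_max-1 zero-filled. A minimal sketch for the real case under the assumption cdim_bcast == 1; the function name pack_real_ref and its plain-C form are illustrative, not part of the patch:

#include <stddef.h>

// Hypothetical scalar reference for the real packm kernels:
// p[i + j*ldp] = kappa * a[i*inca + j*lda] for i < cdim, j < n,
// and zero elsewhere up to cdim_max rows and n_max columns.
static void pack_real_ref(size_t cdim, size_t cdim_max,
                          size_t n, size_t n_max, double kappa,
                          const double *a, size_t inca, size_t lda,
                          double *p, size_t ldp)
{
    for (size_t j = 0; j < n_max; ++j)
        for (size_t i = 0; i < cdim_max; ++i)
            p[i + j * ldp] = (i < cdim && j < n)
                           ? kappa * a[i * inca + j * lda]
                           : 0.0;
}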
+ + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef PACKM + +PACKM(PRECISION_CHAR, void) +{ + (void) conja; // Suppress unused parameter warnings + (void) schema; + (void) params; + (void) cntx; + const DATATYPE* restrict kappa = kappa_; + const DATATYPE* restrict a = a_; + DATATYPE* restrict p = p_; + + // MRxk kernel + if (cdim <= MR && cdim_max == MR && cdim_bcast == 1) + { + if (lda == 1) { + // a is "row major" + // pad the lower edge with zeros + RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6; + switch (cdim) { + case 0: + arow0 = VFMV_V_F(PREC, LMUL)(0., n); + case 1: + arow1 = VFMV_V_F(PREC, LMUL)(0., n); + case 2: + arow2 = VFMV_V_F(PREC, LMUL)(0., n); + case 3: + arow3 = VFMV_V_F(PREC, LMUL)(0., n); + case 4: + arow4 = VFMV_V_F(PREC, LMUL)(0., n); + case 5: + arow5 = VFMV_V_F(PREC, LMUL)(0., n); + case 6: + arow6 = VFMV_V_F(PREC, LMUL)(0., n); + } + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + switch (cdim) { + case 7: + arow6 = VLE_V_F(PREC, LMUL)(a + 6 * inca, vl); + case 6: + arow5 = VLE_V_F(PREC, LMUL)(a + 5 * inca, vl); + case 5: + arow4 = VLE_V_F(PREC, LMUL)(a + 4 * inca, vl); + case 4: + arow3 = VLE_V_F(PREC, LMUL)(a + 3 * inca, vl); + case 3: + arow2 = VLE_V_F(PREC, LMUL)(a + 2 * inca, vl); + case 2: + arow1 = VLE_V_F(PREC, LMUL)(a + 1 * inca, vl); + case 1: + arow0 = VLE_V_F(PREC, LMUL)(a + 0 * inca, vl); + } + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + switch (cdim) { + case 7: + arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl); + case 6: + arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl); + case 5: + arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl); + case 4: + arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl); + case 3: + arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl); + case 2: + arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl); + case 1: + arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 7) ablock = VUNDEFINED_FX(PREC, LMUL, 7)(); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 0, arow0); + ablock = 
VSET_V_F(PREC, LMUL, 7)(ablock, 1, arow1); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 2, arow2); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 3, arow3); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 4, arow4); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 5, arow5); + ablock = VSET_V_F(PREC, LMUL, 7)(ablock, 6, arow6); + VSSSEG7_V_F(PREC, LMUL, 7)(p, FLT_SIZE * ldp, ablock, vl); + + a += vl; + p += vl * ldp; + avl -= vl; + } + + RVV_TYPE_F(PREC, LMUL_MR) zero_padding = VFMV_V_F(PREC, LMUL_MR)(0., cdim_max); + for (size_t i = n; i < n_max; ++i) { + VSE_V_F(PREC, LMUL_MR)(p, zero_padding, cdim_max); + p += ldp; + } + } + else { + RVV_TYPE_F(PREC, LMUL_MR) zero_padding = VFMV_V_F(PREC, LMUL_MR)(0., cdim_max); + for (size_t i = 0; i < n; ++i) { + RVV_TYPE_F(PREC, LMUL_MR) acol_vec; + if (inca == 1) + acol_vec = VLE_V_F_TU(PREC, LMUL_MR)(zero_padding, a, cdim); + else + acol_vec = VLSE_V_F_TU(PREC, LMUL_MR)(zero_padding, a, FLT_SIZE * inca, cdim); + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) + acol_vec = VFMUL_VF_TU(PREC, LMUL_MR)(acol_vec, acol_vec, *kappa, cdim); + + VSE_V_F(PREC, LMUL_MR)(p, acol_vec, cdim_max); + + a += lda; + p += ldp; + } + + for (size_t i = n; i < n_max; ++i) { + VSE_V_F(PREC, LMUL_MR)(p, zero_padding, cdim_max); + p += ldp; + } + } + } + // NRxk kernel + else if (cdim <= NR && cdim_max == NR && cdim_bcast == 1) + { + if (lda == 1) { + // a is "row major" + RVV_TYPE_F(PREC, LMUL_NR) zero_padding = VFMV_V_F(PREC, LMUL_NR)(0., cdim_max); + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + dim_t cdim_tmp = cdim; + const DATATYPE* restrict a_tmp = a; + DATATYPE* restrict p_tmp = p; + while (cdim_tmp >= 8) { + RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6, arow7; + arow0 = VLE_V_F(PREC, LMUL)(a_tmp + 0 * inca, vl); + arow1 = VLE_V_F(PREC, LMUL)(a_tmp + 1 * inca, vl); + arow2 = VLE_V_F(PREC, LMUL)(a_tmp + 2 * inca, vl); + arow3 = VLE_V_F(PREC, LMUL)(a_tmp + 3 * inca, vl); + arow4 = VLE_V_F(PREC, LMUL)(a_tmp + 4 * inca, vl); + arow5 = VLE_V_F(PREC, LMUL)(a_tmp + 5 * inca, vl); + arow6 = VLE_V_F(PREC, LMUL)(a_tmp + 6 * inca, vl); + arow7 = VLE_V_F(PREC, LMUL)(a_tmp + 7 * inca, vl); + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl); + arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl); + arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl); + arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl); + arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl); + arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl); + arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl); + arow7 = VFMUL_VF(PREC, LMUL)(arow7, *kappa, vl); + } + + RVV_TYPE_FX(PREC, LMUL, 8) ablock = VUNDEFINED_FX(PREC, LMUL, 8)(); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 0, arow0); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 1, arow1); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 2, arow2); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 3, arow3); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 4, arow4); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 5, arow5); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 6, arow6); + ablock = VSET_V_F(PREC, LMUL, 8)(ablock, 7, arow7); + VSSSEG8_V_F(PREC, LMUL, 8)(p_tmp, FLT_SIZE * ldp, ablock, vl); + + a_tmp += 8 * inca; + p_tmp += 8; + cdim_tmp -= 8; + } + + if (cdim_tmp > 0) { + RVV_TYPE_F(PREC, LMUL) arow0, arow1, arow2, arow3, arow4, arow5, arow6; + switch (cdim_tmp) { + case 7: + arow6 = VLE_V_F(PREC, LMUL)(a_tmp + 6 * inca, vl); + case 6: + arow5 = VLE_V_F(PREC, LMUL)(a_tmp + 5 * inca, vl); + case 5: + arow4 = VLE_V_F(PREC, LMUL)(a_tmp 
+ 4 * inca, vl); + case 4: + arow3 = VLE_V_F(PREC, LMUL)(a_tmp + 3 * inca, vl); + case 3: + arow2 = VLE_V_F(PREC, LMUL)(a_tmp + 2 * inca, vl); + case 2: + arow1 = VLE_V_F(PREC, LMUL)(a_tmp + 1 * inca, vl); + case 1: + arow0 = VLE_V_F(PREC, LMUL)(a_tmp + 0 * inca, vl); + } + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) { + switch (cdim_tmp) { + case 7: + arow6 = VFMUL_VF(PREC, LMUL)(arow6, *kappa, vl); + case 6: + arow5 = VFMUL_VF(PREC, LMUL)(arow5, *kappa, vl); + case 5: + arow4 = VFMUL_VF(PREC, LMUL)(arow4, *kappa, vl); + case 4: + arow3 = VFMUL_VF(PREC, LMUL)(arow3, *kappa, vl); + case 3: + arow2 = VFMUL_VF(PREC, LMUL)(arow2, *kappa, vl); + case 2: + arow1 = VFMUL_VF(PREC, LMUL)(arow1, *kappa, vl); + case 1: + arow0 = VFMUL_VF(PREC, LMUL)(arow0, *kappa, vl); + } + } + + RVV_TYPE_FX(PREC, LMUL, 7) ablock7 = VUNDEFINED_FX(PREC, LMUL, 7)(); + RVV_TYPE_FX(PREC, LMUL, 6) ablock6 = VUNDEFINED_FX(PREC, LMUL, 6)(); + RVV_TYPE_FX(PREC, LMUL, 5) ablock5 = VUNDEFINED_FX(PREC, LMUL, 5)(); + RVV_TYPE_FX(PREC, LMUL, 4) ablock4 = VUNDEFINED_FX(PREC, LMUL, 4)(); + RVV_TYPE_FX(PREC, LMUL, 3) ablock3 = VUNDEFINED_FX(PREC, LMUL, 3)(); + RVV_TYPE_FX(PREC, LMUL, 2) ablock2 = VUNDEFINED_FX(PREC, LMUL, 2)(); + switch (cdim_tmp) { + case 7: + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 0, arow0); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 1, arow1); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 2, arow2); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 3, arow3); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 4, arow4); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 5, arow5); + ablock7 = VSET_V_F(PREC, LMUL, 7)(ablock7, 6, arow6); + VSSSEG7_V_F(PREC, LMUL, 7)(p_tmp, FLT_SIZE * ldp, ablock7, vl); + break; + case 6: + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 0, arow0); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 1, arow1); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 2, arow2); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 3, arow3); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 4, arow4); + ablock6 = VSET_V_F(PREC, LMUL, 6)(ablock6, 5, arow5); + VSSSEG6_V_F(PREC, LMUL, 6)(p_tmp, FLT_SIZE * ldp, ablock6, vl); + break; + case 5: + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 0, arow0); + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 1, arow1); + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 2, arow2); + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 3, arow3); + ablock5 = VSET_V_F(PREC, LMUL, 5)(ablock5, 4, arow4); + VSSSEG5_V_F(PREC, LMUL, 5)(p_tmp, FLT_SIZE * ldp, ablock5, vl); + break; + case 4: + ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 0, arow0); + ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 1, arow1); + ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 2, arow2); + ablock4 = VSET_V_F(PREC, LMUL, 4)(ablock4, 3, arow3); + VSSSEG4_V_F(PREC, LMUL, 4)(p_tmp, FLT_SIZE * ldp, ablock4, vl); + break; + case 3: + ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 0, arow0); + ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 1, arow1); + ablock3 = VSET_V_F(PREC, LMUL, 3)(ablock3, 2, arow2); + VSSSEG3_V_F(PREC, LMUL, 3)(p_tmp, FLT_SIZE * ldp, ablock3, vl); + break; + case 2: + ablock2 = VSET_V_F(PREC, LMUL, 2)(ablock2, 0, arow0); + ablock2 = VSET_V_F(PREC, LMUL, 2)(ablock2, 1, arow1); + VSSSEG2_V_F(PREC, LMUL, 2)(p_tmp, FLT_SIZE * ldp, ablock2, vl); + break; + case 1: + VSSE_V_F(PREC, LMUL)(p_tmp, FLT_SIZE * ldp, arow0, vl); + break; + } + p_tmp += cdim_tmp; + } + + for (size_t i = 0; i < vl; ++i) { + VSE_V_F(PREC, LMUL_NR)(p_tmp, zero_padding, cdim_max - cdim); + p_tmp += ldp; + } + + a += vl; + p += vl * ldp; + avl -= vl; + } + + for (size_t i = n; i < 
n_max; ++i) { + VSE_V_F(PREC, LMUL_NR)(p, zero_padding, cdim_max); + p += ldp; + } + } else { + RVV_TYPE_F(PREC, LMUL_NR) zero_padding = VFMV_V_F(PREC, LMUL_NR)(0., cdim_max); + for (size_t i = 0; i < n; ++i) { + RVV_TYPE_F(PREC, LMUL_NR) acol_vec; + if (inca == 1) + acol_vec = VLE_V_F_TU(PREC, LMUL_NR)(zero_padding, a, cdim); + else + acol_vec = VLSE_V_F_TU(PREC, LMUL_NR)(zero_padding, a, FLT_SIZE * inca, cdim); + + if (!PASTEMAC(PRECISION_CHAR, eq1)(*kappa)) + acol_vec = VFMUL_VF_TU(PREC, LMUL_NR)(acol_vec, acol_vec, *kappa, cdim); + + VSE_V_F(PREC, LMUL_NR)(p, acol_vec, cdim_max); + + a += lda; + p += ldp; + } + + for (size_t i = n; i < n_max; ++i) { + VSE_V_F(PREC, LMUL_NR)(p, zero_padding, cdim_max); + p += ldp; + } + } + } + // generic kernel + else + { + REF_KERNEL(PRECISION_CHAR) + ( + conja, + schema, + cdim, + cdim_max, + cdim_bcast, + n, + n_max, + kappa, + a, inca, lda, + p, ldp, + params, + cntx + ); + } + + return; +} + +#endif // PACKM diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c deleted file mode 100644 index f4a5a26caf..0000000000 --- a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c +++ /dev/null @@ -1,2406 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
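One detail worth calling out in the intrinsic kernels above: when a column holds fewer than cdim_max elements, the _TU (tail-undisturbed) load variants are given a vector of zeros as their pass-through operand, so lanes at index >= cdim keep the zero value and the subsequent full-width store writes the padding for free. A hedged standalone sketch of the same idea using the ratified RVV intrinsics (older toolchains spell them without the __riscv_ prefix; pad_load_column is a hypothetical name, and cdim_max is assumed to fit in one m1 register group):

#include <riscv_vector.h>
#include <stddef.h>

// Load `cdim` doubles from `a`, zero-pad to `cdim_max`, and store the
// padded column to `p`, mirroring the VLE_V_F_TU usage in the kernel above.
static void pad_load_column(const double *a, double *p,
                            size_t cdim, size_t cdim_max)
{
    vfloat64m1_t zero = __riscv_vfmv_v_f_f64m1(0.0, cdim_max);
    // Tail-undisturbed load: lanes >= cdim keep the zeros from `zero`.
    vfloat64m1_t col = __riscv_vle64_v_f64m1_tu(zero, a, cdim);
    __riscv_vse64_v_f64m1(p, col, cdim_max);
}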
- -*/ - -// clang-format off -#include "blis.h" -#include "../riscv_cmul_macros_asm.h" -#include "../bli_kernels_sifive_x280.h" -#include -#include -#include -#include - -// byte-size of the floating point type -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLE "vle32.v " -#define VLSE "vlse32.v " -#define VSE "vse32.v " -#define VSSE "vsse32.v " -#define PACKMR 8 -#define PACKNR 64 - -void bli_sgemm_7m4 - ( - dim_t N, - dim_t K, - const float* restrict alpha, - const float* restrict a, - const float* restrict b, - const float* restrict beta, - float* restrict c, inc_t rsc, inc_t csc - ) -{ - // 7 x N x K sgemm, 0 < N <= 64 = vlmax, K > 0 - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - bool first = true; - // compute a*b - for (dim_t k = 0; k < K; ++k) { - __asm__(VLE "v28, (%0)" : : "r"(b)); - if (first) { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmul.vf v0, v28, ft0"); - - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmul.vf v4, v28, ft1"); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmul.vf v8, v28, ft2"); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmul.vf v12, v28, ft3"); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmul.vf v16, v28, ft4"); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmul.vf v20, v28, ft5"); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__("vfmul.vf v24, v28, ft6"); - - first = false; - } - else { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmacc.vf v0, ft0, v28"); - - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmacc.vf v4, ft1, v28"); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmacc.vf v8, ft2, v28"); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmacc.vf v12, ft3, v28"); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmacc.vf v16, ft4, v28"); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmacc.vf v20, ft5, v28"); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__("vfmacc.vf v24, ft6, v28"); - } - - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); - } - - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - - // compute alpha*a*b + beta*c - if (*beta == 0.f) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("vfmul.vf v24, v24, ft10"); - } - else { // beta != 0.f - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - float *c_tmp = c; - if (csc == FLT_SIZE) { // c unit column stride - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v0, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - 
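The semantics of this microkernel (and of its intrinsic replacement) are the usual rank-k update C := alpha*A*B + beta*C on a 7 x N block, with A packed MR x K at leading dimension PACKMR and B packed K x N at leading dimension PACKNR; when beta == 0, C is written without being read, so NaNs already in C are not propagated. A hypothetical scalar restatement, with illustrative names only:

#include <stddef.h>

// Scalar reference for the 7xN sgemm microkernel semantics.
// a[i + k*packmr] and b[j + k*packnr] address the packed panels.
static void sgemm_ref_7xn(size_t N, size_t K, float alpha,
                          const float *a, size_t packmr,
                          const float *b, size_t packnr,
                          float beta, float *c, size_t rsc, size_t csc)
{
    for (size_t i = 0; i < 7; ++i)
        for (size_t j = 0; j < N; ++j) {
            float ab = 0.0f;
            for (size_t k = 0; k < K; ++k)
                ab += a[i + k * packmr] * b[j + k * packnr];
            float *cij = c + i * rsc + j * csc;
            *cij = (beta == 0.0f) ? alpha * ab          // C not read when beta == 0
                                  : alpha * ab + beta * (*cij);
        }
}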
- __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v24, v24, ft10"); - __asm__("vfmacc.vf v24, ft11, v28"); - } // end c unit column stride - else { // c non-unit column stride - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v0, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v24, v24, ft10"); - __asm__("vfmacc.vf v24, ft11, v28"); - } // end c non-unit column stride - } // end beta != 0.f - - // store c - if (csc == FLT_SIZE) { - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v24, (%0)" : : "r"(c)); - } - else { - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - } - - return; -} - -void bli_sgemm_7m4_cleanup - ( - dim_t M, - dim_t N, - dim_t K, - const float* restrict alpha, - const float* restrict a, - const float* restrict b, - const float* restrict beta, - 
float* restrict c, inc_t rsc, inc_t csc - ) -{ - // M x N x K sgemm, 0 < M < 6, 0 < N <= 64 = vlmax, K > 0 - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - bool first = true; - // compute a*b - for (dim_t k = 0; k < K; ++k) { - __asm__(VLE "v28, (%0)" : : "r"(b)); - if (first) { - switch (M) { - case 6: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmul.vf v20, v28, ft5"); - case 5: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmul.vf v16, v28, ft4"); - case 4: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmul.vf v12, v28, ft3"); - case 3: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmul.vf v8, v28, ft2"); - case 2: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmul.vf v4, v28, ft1"); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmul.vf v0, v28, ft0"); - } - first = false; - } - else { - switch (M) { - case 6: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmacc.vf v20, ft5, v28"); - case 5: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmacc.vf v16, ft4, v28"); - case 4: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmacc.vf v12, ft3, v28"); - case 3: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmacc.vf v8, ft2, v28"); - case 2: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmacc.vf v4, ft1, v28"); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmacc.vf v0, ft0, v28"); - } - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); - } - - c += (M - 1) * rsc; - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - - // compute alpha*a*b + beta*c - if (*beta == 0.f) { - switch (M) { - case 6: - __asm__("vfmul.vf v20, v20, ft10"); - case 5: - __asm__("vfmul.vf v16, v16, ft10"); - case 4: - __asm__("vfmul.vf v12, v12, ft10"); - case 3: - __asm__("vfmul.vf v8, v8, ft10"); - case 2: - __asm__("vfmul.vf v4, v4, ft10"); - case 1: - __asm__("vfmul.vf v0, v0, ft10"); - } - } - else { // beta != 0.f - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - float *c_tmp = c; - if (csc == FLT_SIZE) { - switch (M) { - case 6: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - case 5: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - case 4: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - case 3: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - case 2: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - case 1: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmacc.vf v0, ft11, v28"); - } - } // end c unit column stride - else { 
// c non-unit column stride - switch (M) { - case 6: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - case 5: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - case 4: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - case 3: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - case 2: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - case 1: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmacc.vf v0, ft11, v28"); - } - } // end c non-unit column stride - } // end beta != 0.f - - // store c - if (csc == FLT_SIZE) { - switch (M) { - case 6: - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSE "v0, (%0)" : : "r"(c)); - } - } - else { - switch (M) { - case 6: - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } - return; -} - -void bli_sgemm_7m4_k0 - ( - dim_t M, - dim_t N, - const float* restrict beta, - float* restrict c, inc_t rsc, inc_t csc - ) -{ - // 0 < M <= 7, 0 < N < 64 = vlmax, K = 0 - // This may not produce the same result as the reference kernel if alpha is infinite or NaN. 
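The comment above flags a real corner case: with K == 0 the product A*B is an empty sum, i.e. exactly zero, so the reference result is alpha*0 + beta*C. The K == 0 path skips the alpha term entirely and computes only beta*C, which differs whenever alpha is infinite or NaN, since alpha*0 is then NaN. A small self-contained illustration (hypothetical variable names, IEEE-754 floats assumed):

#include <math.h>
#include <stdio.h>

// Why the K == 0 fast path can differ from the reference kernel when
// alpha is infinite or NaN: alpha*(A*B) with A*B == 0 is NaN for such alpha.
int main(void)
{
    float alpha = INFINITY, beta = 2.0f, c = 3.0f;
    float reference = alpha * 0.0f + beta * c;  // NaN: Inf * 0 is NaN
    float fast_path = beta * c;                 // 6.0f: alpha term skipped
    printf("reference = %f, fast path = %f\n", (double)reference, (double)fast_path);
    return 0;
}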
- __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - c += (M - 1) * rsc; - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - if (*beta == 0.f) { - // set c to 0 - __asm__("vmv.v.i v0, 0"); - if (csc == FLT_SIZE) { // c unit column stride - switch (M) { - case 7: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSE "v0, (%0)" : : "r"(c)); - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 7: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } // end c non-unit column stride - } // end beta == 0.f - else { // beta != 0.f - __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); - if (csc == FLT_SIZE) { // c unit column stride - switch (M) { - case 7: - __asm__(VLE "v24, (%0)" : : "r"(c)); - __asm__("vfmul.vf v24, v24, ft0"); - __asm__(VSE "v24, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VLE "v20, (%0)" : : "r"(c)); - __asm__("vfmul.vf v20, v20, ft0"); - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VLE "v16, (%0)" : : "r"(c)); - __asm__("vfmul.vf v16, v16, ft0"); - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VLE "v12, (%0)" : : "r"(c)); - __asm__("vfmul.vf v12, v12, ft0"); - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VLE "v8, (%0)" : : "r"(c)); - __asm__("vfmul.vf v8, v8, ft0"); - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VLE "v4, (%0)" : : "r"(c)); - __asm__("vfmul.vf v4, v4, ft0"); - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VLE "v0, (%0)" : : "r"(c)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__(VSE "v0, (%0)" : : "r"(c)); - - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 7: - __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v24, v24, ft0"); - __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft0"); - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - 
__asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft0"); - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft0"); - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft0"); - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft0"); - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } // end c non-unit column stride - } // end beta != 0.f - return; -} - -void bli_sgemm_sifive_x280_asm_7m4 - ( - dim_t M, - dim_t N, - dim_t K, - const void* restrict alpha_, - const void* restrict a_, - const void* restrict b_, - const void* restrict beta_, - void* restrict c_, inc_t rsc, inc_t csc, - const auxinfo_t* restrict data, - const cntx_t* restrict cntx - ) -{ - (void) data; - (void) cntx; - const float* restrict alpha = alpha_; - const float* restrict beta = beta_; - const float* restrict a = a_; - const float* restrict b = b_; - float* restrict c = c_; - - // M x N x K sgemm - if (M <= 0 || N <= 0 || K < 0) - return; - else if (K == 0) - bli_sgemm_7m4_k0(M, N, beta, c, rsc, csc); - else if (M == 7) - bli_sgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc); - else - bli_sgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef PACKMR -#undef PACKNR - -// byte-size of the floating point type -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLE "vle64.v " -#define VLSE "vlse64.v " -#define VSE "vse64.v " -#define VSSE "vsse64.v " -#define PACKMR 8 -#define PACKNR 32 - -void bli_dgemm_7m4 - ( - dim_t N, - dim_t K, - const double* restrict alpha, - const double* restrict a, - const double* restrict b, - const double* restrict beta, - double* restrict c, inc_t rsc, inc_t csc - ) -{ - // 7 x N x K dgemm, 0 < N <= 64 = vlmax, K > 0 - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - bool first = true; - // compute a*b - for (dim_t k = 0; k < K; ++k) { - __asm__(VLE "v28, (%0)" : : "r"(b)); - if (first) { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmul.vf v0, v28, ft0"); - - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmul.vf v4, v28, ft1"); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmul.vf v8, v28, ft2"); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmul.vf v12, v28, ft3"); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmul.vf v16, v28, ft4"); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmul.vf v20, v28, ft5"); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__("vfmul.vf v24, v28, ft6"); - - first = false; - } - else { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmacc.vf v0, ft0, v28"); - - 
__asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmacc.vf v4, ft1, v28"); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmacc.vf v8, ft2, v28"); - - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmacc.vf v12, ft3, v28"); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmacc.vf v16, ft4, v28"); - - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmacc.vf v20, ft5, v28"); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__("vfmacc.vf v24, ft6, v28"); - } - - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); - } - - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - - // compute alpha*a*b + beta*c - if (*beta == 0.) { - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("vfmul.vf v24, v24, ft10"); - } - else { // beta != 0. - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - double *c_tmp = c; - if (csc == FLT_SIZE) { // c unit column stride - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v0, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v24, v24, ft10"); - __asm__("vfmacc.vf v24, ft11, v28"); - } // end c unit column stride - else { // c non-unit column stride - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v0, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); 
- __asm__("vfmul.vf v20, v20, ft10"); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v24, v24, ft10"); - __asm__("vfmacc.vf v24, ft11, v28"); - } // end c non-unit column stride - } // end beta != 0. - - // store c - if (csc == FLT_SIZE) { - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSE "v24, (%0)" : : "r"(c)); - } - else { - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - } - - return; -} - -void bli_dgemm_7m4_cleanup - ( - dim_t M, - dim_t N, - dim_t K, - const double* restrict alpha, - const double* restrict a, - const double* restrict b, - const double* restrict beta, - double* restrict c, inc_t rsc, inc_t csc - ) -{ - // M x N x K dgemm, 0 < M < 6, 0 < N <= 64 = vlmax, K > 0 - __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - bool first = true; - // compute a*b - for (dim_t k = 0; k < K; ++k) { - __asm__(VLE "v28, (%0)" : : "r"(b)); - if (first) { - switch (M) { - case 6: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmul.vf v20, v28, ft5"); - case 5: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmul.vf v16, v28, ft4"); - case 4: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmul.vf v12, v28, ft3"); - case 3: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmul.vf v8, v28, ft2"); - case 2: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmul.vf v4, v28, ft1"); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmul.vf v0, v28, ft0"); - } - first = false; - } - else { - switch (M) { - case 6: - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - __asm__("vfmacc.vf v20, ft5, v28"); - case 5: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__("vfmacc.vf v16, ft4, v28"); - case 4: - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - __asm__("vfmacc.vf v12, ft3, v28"); - case 3: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__("vfmacc.vf v8, ft2, v28"); - case 2: - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - __asm__("vfmacc.vf v4, ft1, v28"); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__("vfmacc.vf v0, ft0, v28"); - } 
- } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); - } - - c += (M - 1) * rsc; - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - - __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); - - // compute alpha*a*b + beta*c - if (*beta == 0.) { - switch (M) { - case 6: - __asm__("vfmul.vf v20, v20, ft10"); - case 5: - __asm__("vfmul.vf v16, v16, ft10"); - case 4: - __asm__("vfmul.vf v12, v12, ft10"); - case 3: - __asm__("vfmul.vf v8, v8, ft10"); - case 2: - __asm__("vfmul.vf v4, v4, ft10"); - case 1: - __asm__("vfmul.vf v0, v0, ft10"); - } - } - else { // beta != 0. - __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); - double *c_tmp = c; - if (csc == FLT_SIZE) { - switch (M) { - case 6: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - case 5: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - case 4: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - case 3: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - case 2: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - case 1: - __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmacc.vf v0, ft11, v28"); - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 6: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v20, ft11, v28"); - case 5: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v16, ft11, v28"); - case 4: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v12, ft11, v28"); - case 3: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v8, ft11, v28"); - case 2: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft10"); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__("vfmacc.vf v4, ft11, v28"); - case 1: - __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft10"); - __asm__("vfmacc.vf v0, ft11, v28"); - } - } // end c non-unit column stride - } // end beta != 0. 
- - // store c - if (csc == FLT_SIZE) { - switch (M) { - case 6: - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSE "v0, (%0)" : : "r"(c)); - } - } - else { - switch (M) { - case 6: - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } - return; -} - -void bli_dgemm_7m4_k0 - ( - dim_t M, - dim_t N, - const double* restrict beta, - double* restrict c, inc_t rsc, inc_t csc - ) -{ - // 0 < M <= 7, 0 < N < 64 = vlmax, K = 0 - // This may not produce the same result as the reference kernel if alpha is infinite or NaN. - __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - c += (M - 1) * rsc; - rsc *= FLT_SIZE; - csc *= FLT_SIZE; - if (*beta == 0.) { - // set c to 0 - __asm__("vmv.v.i v0, 0"); - if (csc == FLT_SIZE) { // c unit column stride - switch (M) { - case 7: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSE "v0, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSE "v0, (%0)" : : "r"(c)); - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 7: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } // end c non-unit column stride - } // end beta == 0. - else { // beta != 0. 
- __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); - if (csc == FLT_SIZE) { // c unit column stride - switch (M) { - case 7: - __asm__(VLE "v24, (%0)" : : "r"(c)); - __asm__("vfmul.vf v24, v24, ft0"); - __asm__(VSE "v24, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VLE "v20, (%0)" : : "r"(c)); - __asm__("vfmul.vf v20, v20, ft0"); - __asm__(VSE "v20, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VLE "v16, (%0)" : : "r"(c)); - __asm__("vfmul.vf v16, v16, ft0"); - __asm__(VSE "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VLE "v12, (%0)" : : "r"(c)); - __asm__("vfmul.vf v12, v12, ft0"); - __asm__(VSE "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VLE "v8, (%0)" : : "r"(c)); - __asm__("vfmul.vf v8, v8, ft0"); - __asm__(VSE "v8, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VLE "v4, (%0)" : : "r"(c)); - __asm__("vfmul.vf v4, v4, ft0"); - __asm__(VSE "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VLE "v0, (%0)" : : "r"(c)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__(VSE "v0, (%0)" : : "r"(c)); - - } - } // end c unit column stride - else { // c non-unit column stride - switch (M) { - case 7: - __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v24, v24, ft0"); - __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 6: - __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v20, v20, ft0"); - __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 5: - __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v16, v16, ft0"); - __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v12, v12, ft0"); - __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v8, v8, ft0"); - __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v4, v4, ft0"); - __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("vfmul.vf v0, v0, ft0"); - __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } // end c non-unit column stride - } // end beta != 0. 
- return; -} - -void bli_dgemm_sifive_x280_asm_7m4 - ( - dim_t M, - dim_t N, - dim_t K, - const void* restrict alpha_, - const void* restrict a_, - const void* restrict b_, - const void* restrict beta_, - void* restrict c_, inc_t rsc, inc_t csc, - const auxinfo_t* restrict data, - const cntx_t* restrict cntx - ) -{ - (void) data; - (void) cntx; - const double* restrict alpha = alpha_; - const double* restrict beta = beta_; - const double* restrict a = a_; - const double* restrict b = b_; - double* restrict c = c_; - - // M x N x K dgemm - if (M <= 0 || N <= 0 || K < 0) - return; - else if (K == 0) - bli_dgemm_7m4_k0(M, N, beta, c, rsc, csc); - else if (M == 7) - bli_dgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc); - else - bli_dgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLE -#undef VLSE -#undef VSE -#undef VSSE -#undef PACKMR -#undef PACKNR - -// byte-size of underlying floating point type -#define FLT_SIZE 4 -#define FLT_LOAD "flw " -#define VLSEG2 "vlseg2e32.v " -#define VLSSEG2 "vlsseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define PACKMR 8 -#define PACKNR 32 - -void bli_cgemm_6m2 - ( - dim_t N, - dim_t K, - const scomplex* restrict alpha, - const scomplex* restrict a, - const scomplex* restrict b, - const scomplex* restrict beta, - scomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // 6 x N x K cgemm, N <= 32 = vlmax, K > 0 - // pairs of register groups hold the real and imag. parts of rows of c and b - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmul_vf(v0, v2, v24, v26, ft0, ft1); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmul_vf(v4, v6, v24, v26, ft2, ft3); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmul_vf(v8, v10, v24, v26, ft4, ft5); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmul_vf(v12, v14, v24, v26, ft6, ft7); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmul_vf(v16, v18, v24, v26, ft8, ft9); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmul_vf(v20, v22, v24, v26, ft10, ft11); - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - while (K > 0) { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v28, v30); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v28, v30); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - 
__asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v28, v30); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v28, v30); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v28, v30); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmacc_vf(v20, v22, ft10, ft11, v28, v30); - K -= 1; - - if (K == 0) { break; } - - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v24, v26); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v24, v26); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v24, v26); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v24, v26); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v24, v26); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmacc_vf(v20, v22, ft10, ft11, v24, v26); - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); - - __asm__("vfmul.vf v24, v2, ft1"); - __asm__("vfmul.vf v26, v0, ft1"); - __asm__("vfmul.vf v28, v6, ft1"); - __asm__("vfmul.vf v30, v4, ft1"); - - __asm__("vfmsub.vf v0, ft0, v24"); - __asm__("vfmadd.vf v2, ft0, v26"); - __asm__("vfmsub.vf v4, ft0, v28"); - __asm__("vfmadd.vf v6, ft0, v30"); - - __asm__("vfmul.vf v24, v10, ft1"); - __asm__("vfmul.vf v26, v8, ft1"); - __asm__("vfmul.vf v28, v14, ft1"); - __asm__("vfmul.vf v30, v12, ft1"); - - __asm__("vfmsub.vf v8, ft0, v24"); - __asm__("vfmadd.vf v10, ft0, v26"); - __asm__("vfmsub.vf v12, ft0, v28"); - __asm__("vfmadd.vf v14, ft0, v30"); - - __asm__("vfmul.vf v24, v18, ft1"); - __asm__("vfmul.vf v26, v16, ft1"); - __asm__("vfmul.vf v28, v22, ft1"); - __asm__("vfmul.vf v30, v20, ft1"); - - __asm__("vfmsub.vf v16, ft0, v24"); - __asm__("vfmadd.vf v18, ft0, v26"); - __asm__("vfmsub.vf v20, ft0, v28"); - __asm__("vfmadd.vf v22, ft0, v30"); - - scomplex beta_cast = *beta; - if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { - if (csc == 2 * FLT_SIZE) { - scomplex *c_tmp = c; - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : 
"+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - - vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); - } - else { - scomplex *c_tmp = c; - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - - vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); - } - } - - if (csc == 2 * FLT_SIZE) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v20, (%0)" : : "r"(c)); - } - else { - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc)); - } - - return; -} - -void bli_cgemm_6m2_cleanup - ( - dim_t M, - dim_t N, - dim_t K, - const scomplex* restrict alpha, - const scomplex* restrict a, - const scomplex* restrict b, - const scomplex* restrict beta, - scomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // M x N x K cgemm, 0 < M < 6, N <= 32 = vlmax, K > 0 - // pairs of register groups hold the real and imag. 
parts of rows of c and b - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmul_vf(v16, v18, v24, v26, ft8, ft9); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmul_vf(v12, v14, v24, v26, ft6, ft7); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmul_vf(v8, v10, v24, v26, ft4, ft5); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmul_vf(v4, v6, v24, v26, ft2, ft3); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmul_vf(v0, v2, v24, v26, ft0, ft1); - } - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - while (K > 0) { - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v28, v30); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v28, v30); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v28, v30); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v28, v30); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v28, v30); - } - K -= 1; - - if (K == 0) { break; } - - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v24, v26); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * 
FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v24, v26); - } - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - c += (M - 1) * rsc; - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); - - switch (M) { - case 5: - __asm__("vfmul.vf v24, v18, ft1"); - __asm__("vfmul.vf v26, v16, ft1"); - __asm__("vfmsub.vf v16, ft0, v24"); - __asm__("vfmadd.vf v18, ft0, v26"); - case 4: - __asm__("vfmul.vf v28, v14, ft1"); - __asm__("vfmul.vf v30, v12, ft1"); - __asm__("vfmsub.vf v12, ft0, v28"); - __asm__("vfmadd.vf v14, ft0, v30"); - case 3: - __asm__("vfmul.vf v24, v10, ft1"); - __asm__("vfmul.vf v26, v8, ft1"); - __asm__("vfmsub.vf v8, ft0, v24"); - __asm__("vfmadd.vf v10, ft0, v26"); - case 2: - __asm__("vfmul.vf v28, v6, ft1"); - __asm__("vfmul.vf v30, v4, ft1"); - __asm__("vfmsub.vf v4, ft0, v28"); - __asm__("vfmadd.vf v6, ft0, v30"); - case 1: - __asm__("vfmul.vf v24, v2, ft1"); - __asm__("vfmul.vf v26, v0, ft1"); - __asm__("vfmsub.vf v0, ft0, v24"); - __asm__("vfmadd.vf v2, ft0, v26"); - } - - scomplex beta_cast = *beta; - if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { - if (csc == 2 * FLT_SIZE) { - scomplex *c_tmp = c; - switch (M) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - case 4: - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - case 2: - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - } - } - else { - scomplex *c_tmp = c; - switch (M) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - case 4: - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - case 2: - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - } - } - } - - if (csc == 2 * FLT_SIZE) { - switch (M) { - case 5: - __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); - __asm__("sub %0, 
%0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - } - } - else { - switch (M) { - case 5: - __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } - - return; -} - -void bli_cgemm_6m2_k0 - ( - dim_t M, - dim_t N, - const scomplex* restrict beta, - scomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0 - // This may not produce the same result as the reference kernel if alpha is infinite or NaN. - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - csc *= 2 * FLT_SIZE; - - scomplex beta_cast = *beta; - if (beta_cast.real == 0.f && beta_cast.imag == 0.f) { - // set c to 0 - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - for (size_t i = 0; i < M; ++i) { - if (csc == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - c += rsc; - } - } - else { - // scale c by beta - for (size_t i = 0; i < M; ++i) { - if (csc == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(c)); - vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - } - c += rsc; - } - } - return; -} - -void bli_cgemm_sifive_x280_asm_6m2 - ( - dim_t M, - dim_t N, - dim_t K, - const void* restrict alpha_, - const void* restrict a_, - const void* restrict b_, - const void* restrict beta_, - void* restrict c_, inc_t rsc, inc_t csc, - const auxinfo_t* restrict data, - const cntx_t* restrict cntx - ) -{ - // M x N x K cgemm - (void) data; - (void) cntx; - const scomplex* restrict alpha = alpha_; - const scomplex* restrict beta = beta_; - const scomplex* restrict a = a_; - const scomplex* restrict b = b_; - scomplex* restrict c = c_; - - if (M <= 0 || N <= 0 || K < 0) - return; - else if (K == 0) - bli_cgemm_6m2_k0(M, N, beta, c, rsc, csc); - else if (M == 6) - bli_cgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc); - else - bli_cgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef PACKMR -#undef PACKNR - -// byte-size of underlying floating point type -#define FLT_SIZE 8 -#define FLT_LOAD "fld " -#define VLSEG2 "vlseg2e64.v " -#define VLSSEG2 "vlsseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define PACKMR 8 -#define PACKNR 16 - -void bli_zgemm_6m2 - ( - dim_t N, - dim_t K, - const dcomplex* restrict alpha, - const dcomplex* restrict a, - const dcomplex* restrict b, - const dcomplex* restrict beta, - dcomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // 6 x N x K zgemm, N <= 32 = vlmax, K > 0 - // pairs of register groups hold the real and imag. 
parts of rows of c and b - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmul_vf(v0, v2, v24, v26, ft0, ft1); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmul_vf(v4, v6, v24, v26, ft2, ft3); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmul_vf(v8, v10, v24, v26, ft4, ft5); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmul_vf(v12, v14, v24, v26, ft6, ft7); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmul_vf(v16, v18, v24, v26, ft8, ft9); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmul_vf(v20, v22, v24, v26, ft10, ft11); - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - while (K > 0) { - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v28, v30); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v28, v30); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v28, v30); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v28, v30); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v28, v30); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmacc_vf(v20, v22, ft10, ft11, v28, v30); - K -= 1; - - if (K == 0) { break; } - - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v24, v26); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v24, v26); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v24, v26); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * 
FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v24, v26); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v24, v26); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); - vcmacc_vf(v20, v22, ft10, ft11, v24, v26); - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); - - __asm__("vfmul.vf v24, v2, ft1"); - __asm__("vfmul.vf v26, v0, ft1"); - __asm__("vfmul.vf v28, v6, ft1"); - __asm__("vfmul.vf v30, v4, ft1"); - - __asm__("vfmsub.vf v0, ft0, v24"); - __asm__("vfmadd.vf v2, ft0, v26"); - __asm__("vfmsub.vf v4, ft0, v28"); - __asm__("vfmadd.vf v6, ft0, v30"); - - __asm__("vfmul.vf v24, v10, ft1"); - __asm__("vfmul.vf v26, v8, ft1"); - __asm__("vfmul.vf v28, v14, ft1"); - __asm__("vfmul.vf v30, v12, ft1"); - - __asm__("vfmsub.vf v8, ft0, v24"); - __asm__("vfmadd.vf v10, ft0, v26"); - __asm__("vfmsub.vf v12, ft0, v28"); - __asm__("vfmadd.vf v14, ft0, v30"); - - __asm__("vfmul.vf v24, v18, ft1"); - __asm__("vfmul.vf v26, v16, ft1"); - __asm__("vfmul.vf v28, v22, ft1"); - __asm__("vfmul.vf v30, v20, ft1"); - - __asm__("vfmsub.vf v16, ft0, v24"); - __asm__("vfmadd.vf v18, ft0, v26"); - __asm__("vfmsub.vf v20, ft0, v28"); - __asm__("vfmadd.vf v22, ft0, v30"); - - dcomplex beta_cast = *beta; - if (beta_cast.real != 0. || beta_cast.imag != 0.) 
{ - if (csc == 2 * FLT_SIZE) { - dcomplex *c_tmp = c; - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - - vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); - } - else { - dcomplex *c_tmp = c; - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - - vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); - } - } - - if (csc == 2 * FLT_SIZE) { - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSEG2 "v20, (%0)" : : "r"(c)); - } - else { - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); - __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc)); - } - - return; -} - -void bli_zgemm_6m2_cleanup - ( - dim_t M, - dim_t N, - dim_t K, - const dcomplex* restrict alpha, - const dcomplex* restrict a, - const dcomplex* restrict b, - const dcomplex* restrict beta, - dcomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // M x N x K zgemm, 0 < M < 6, N <= 32 = vlmax, K > 0 - // pairs of 
register groups hold the real and imag. parts of rows of c and b - - __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmul_vf(v16, v18, v24, v26, ft8, ft9); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmul_vf(v12, v14, v24, v26, ft6, ft7); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmul_vf(v8, v10, v24, v26, ft4, ft5); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmul_vf(v4, v6, v24, v26, ft2, ft3); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmul_vf(v0, v2, v24, v26, ft0, ft1); - } - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - while (K > 0) { - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v28, v30); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v28, v30); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v28, v30); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v28, v30); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v28, v30); - } - K -= 1; - - if (K == 0) { break; } - - if (K >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - - switch (M) { - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); - vcmacc_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); - vcmacc_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); - vcmacc_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); - vcmacc_vf(v4, v6, ft2, ft3, v24, v26); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD 
"ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); - vcmacc_vf(v0, v2, ft0, ft1, v24, v26); - } - K -= 1; - - if (K >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); - __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - c += (M - 1) * rsc; - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); - - switch (M) { - case 5: - __asm__("vfmul.vf v24, v18, ft1"); - __asm__("vfmul.vf v26, v16, ft1"); - __asm__("vfmsub.vf v16, ft0, v24"); - __asm__("vfmadd.vf v18, ft0, v26"); - case 4: - __asm__("vfmul.vf v28, v14, ft1"); - __asm__("vfmul.vf v30, v12, ft1"); - __asm__("vfmsub.vf v12, ft0, v28"); - __asm__("vfmadd.vf v14, ft0, v30"); - case 3: - __asm__("vfmul.vf v24, v10, ft1"); - __asm__("vfmul.vf v26, v8, ft1"); - __asm__("vfmsub.vf v8, ft0, v24"); - __asm__("vfmadd.vf v10, ft0, v26"); - case 2: - __asm__("vfmul.vf v28, v6, ft1"); - __asm__("vfmul.vf v30, v4, ft1"); - __asm__("vfmsub.vf v4, ft0, v28"); - __asm__("vfmadd.vf v6, ft0, v30"); - case 1: - __asm__("vfmul.vf v24, v2, ft1"); - __asm__("vfmul.vf v26, v0, ft1"); - __asm__("vfmsub.vf v0, ft0, v24"); - __asm__("vfmadd.vf v2, ft0, v26"); - } - - dcomplex beta_cast = *beta; - if (beta_cast.real != 0. || beta_cast.imag != 0.) { - if (csc == 2 * FLT_SIZE) { - dcomplex *c_tmp = c; - switch (M) { - case 5: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - case 4: - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - case 3: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - case 2: - __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - case 1: - __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - } - } - else { - dcomplex *c_tmp = c; - switch (M) { - case 5: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); - case 4: - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); - case 3: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); - case 2: - __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); - vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); - case 1: - __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); - vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); - } - } - } - - if (csc == 2 * FLT_SIZE) { - switch (M) { - case 5: - __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSEG2 "v8, (%0)" : 
: "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - } - } - else { - switch (M) { - case 5: - __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 4: - __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 3: - __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 2: - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); - case 1: - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - } - } - - return; -} - -void bli_zgemm_6m2_k0 - ( - dim_t M, - dim_t N, - const dcomplex* restrict beta, - dcomplex* restrict c, inc_t rsc, inc_t csc - ) -{ - // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0 - // This may not produce the same result as the reference kernel if alpha is infinite or NaN. - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); - csc *= 2 * FLT_SIZE; - - dcomplex beta_cast = *beta; - if (beta_cast.real == 0. && beta_cast.imag == 0.) { - // set c to 0 - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - for (size_t i = 0; i < M; ++i) { - if (csc == 2 * FLT_SIZE) - __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); - else - __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - c += rsc; - } - } - else { - // scale c by beta - for (size_t i = 0; i < M; ++i) { - if (csc == 2 * FLT_SIZE) { - __asm__(VLSEG2 "v0, (%0)" : : "r"(c)); - vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); - __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); - } - else { - __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); - vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); - __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); - } - c += rsc; - } - } - return; -} - -void bli_zgemm_sifive_x280_asm_6m2 - ( - dim_t M, - dim_t N, - dim_t K, - const void* restrict alpha_, - const void* restrict a_, - const void* restrict b_, - const void* restrict beta_, - void* restrict c_, inc_t rsc, inc_t csc, - const auxinfo_t* restrict data, - const cntx_t* restrict cntx - ) -{ - // M x N x K zgemm - (void) data; - (void) cntx; - const dcomplex* restrict alpha = alpha_; - const dcomplex* restrict beta = beta_; - const dcomplex* restrict a = a_; - const dcomplex* restrict b = b_; - dcomplex* restrict c = c_; - - if (M <= 0 || N <= 0 || K < 0) - return; - else if (K == 0) - bli_zgemm_6m2_k0(M, N, beta, c, rsc, csc); - else if (M == 6) - bli_zgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc); - else - bli_zgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); - return; -} - -#undef FLT_SIZE -#undef FLT_LOAD -#undef VLSEG2 -#undef VLSSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef PACKMR -#undef PACKNR diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c new file mode 100644 index 0000000000..664d4616f3 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr.c @@ -0,0 +1,138 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "../../riscv_cmul_macros_intr.h" +#include "../../riscv_overloaded_intrinsics.h" +#include "blis.h" +#include +#include + +#define GEMM_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemm_sifive_x280_intr(\ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const void* restrict alpha_, \ + const void* restrict a_, \ + const void* restrict b_, \ + const void* restrict beta_, \ + void* restrict c_, inc_t rsc, inc_t csc, \ + const auxinfo_t* restrict data, \ + const cntx_t* restrict cntx \ +) + +#define GEMM(...) 
GEMM_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) +#define PACKMR 8 +#define PACKNR 64 + +#include "./bli_gemm_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef PACKMR +#undef PACKNR + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) +#define PACKMR 8 +#define PACKNR 32 + +#include "./bli_gemm_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef PACKMR +#undef PACKNR + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m2 +#define FLT_SIZE sizeof(float) +#define PACKMR 8 +#define PACKNR 32 + +#include "./bli_gemm_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef PACKMR +#undef PACKNR + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m2 +#define FLT_SIZE sizeof(double) +#define PACKMR 8 +#define PACKNR 16 + +#include "./bli_gemm_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef PACKMR +#undef PACKNR + +#undef GEMM +#undef GEMM_ diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..15a19ab49d --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_complex.c @@ -0,0 +1,517 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef GEMM + +GEMM(PRECISION_CHAR, void) +{ + (void) data; // Suppress unused parameter warnings + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict b = b_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict c = c_; + + if (m <= 0 || n <= 0 || k < 0) + return; + else if (k == 0) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_FX(PREC, LMUL, 2) zero_splat = VUNDEFINED_FX(PREC, LMUL, 2)(); + zero_splat = VSET_V_F(PREC, LMUL, 2)(zero_splat, 0, VFMV_V_F(PREC, LMUL)(0., n)); + zero_splat = VSET_V_F(PREC, LMUL, 2)(zero_splat, 1, VFMV_V_F(PREC, LMUL)(0., n)); + + for (dim_t i = 0; i < m; ++i) { + if (csc == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), zero_splat, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, zero_splat, n); + } + } + else { + for (dim_t i = 0; i < m; ++i) { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + RVV_TYPE_F(PREC, LMUL) beta_c0_r, beta_c0_i; + + if (csc == 1) + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), n); + else + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, beta_c0_r, beta_c0_i, c0_r, c0_i, beta->real, beta->imag, n); + c0 = VSET_V_F(PREC, LMUL, 2)(c0, 0, beta_c0_r); + c0 = VSET_V_F(PREC, LMUL, 2)(c0, 1, beta_c0_i); + if (csc == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), c0, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + i * rsc), 2 * FLT_SIZE * csc, c0, n); + } + } + } + else if (m == 6) { + RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r, ab5_r; + RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i, ab5_i; + RVV_TYPE_FX(PREC, LMUL, 2) b0, b1; + RVV_TYPE_F(PREC, LMUL) b0_r, b1_r; + RVV_TYPE_F(PREC, LMUL) b0_i, b1_i; + + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += PACKNR; + } + + VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0].real, a[0].imag, n); + VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1].real, a[1].imag, n); + VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2].real, a[2].imag, n); + VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3].real, a[3].imag, n); + VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4].real, a[4].imag, n); + VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, b0_r, b0_i, a[5].real, a[5].imag, n); + + a += PACKMR; + k -= 1; + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + } + + while (k > 0) { + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b1_r, b1_i, n); + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5].real, a[5].imag, b1_r, b1_i, n); + + a += PACKMR; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); 
+ b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += PACKNR; + } + + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b0_r, b0_i, n); + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5].real, a[5].imag, b0_r, b0_i, n); + + a += PACKMR; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + } + } + + RVV_TYPE_F(PREC, LMUL) temp0_r, temp1_r; + RVV_TYPE_F(PREC, LMUL) temp0_i, temp1_i; + temp0_r = VFMUL_VF(PREC, LMUL)(ab0_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab0_r, alpha->imag, n); + temp1_r = VFMUL_VF(PREC, LMUL)(ab1_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab1_r, alpha->imag, n); + + ab0_r = VFMSUB_VF(PREC, LMUL)(ab0_r, alpha->real, temp0_r, n); + ab0_i = VFMADD_VF(PREC, LMUL)(ab0_i, alpha->real, temp0_i, n); + ab1_r = VFMSUB_VF(PREC, LMUL)(ab1_r, alpha->real, temp1_r, n); + ab1_i = VFMADD_VF(PREC, LMUL)(ab1_i, alpha->real, temp1_i, n); + + temp0_r = VFMUL_VF(PREC, LMUL)(ab2_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab2_r, alpha->imag, n); + temp1_r = VFMUL_VF(PREC, LMUL)(ab3_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab3_r, alpha->imag, n); + + ab2_r = VFMSUB_VF(PREC, LMUL)(ab2_r, alpha->real, temp0_r, n); + ab2_i = VFMADD_VF(PREC, LMUL)(ab2_i, alpha->real, temp0_i, n); + ab3_r = VFMSUB_VF(PREC, LMUL)(ab3_r, alpha->real, temp1_r, n); + ab3_i = VFMADD_VF(PREC, LMUL)(ab3_i, alpha->real, temp1_i, n); + + temp0_r = VFMUL_VF(PREC, LMUL)(ab4_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab4_r, alpha->imag, n); + temp1_r = VFMUL_VF(PREC, LMUL)(ab5_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab5_r, alpha->imag, n); + + ab4_r = VFMSUB_VF(PREC, LMUL)(ab4_r, alpha->real, temp0_r, n); + ab4_i = VFMADD_VF(PREC, LMUL)(ab4_i, alpha->real, temp0_i, n); + ab5_r = VFMSUB_VF(PREC, LMUL)(ab5_r, alpha->real, temp1_r, n); + ab5_i = VFMADD_VF(PREC, LMUL)(ab5_i, alpha->real, temp1_i, n); + + if (!PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + if (csc == 1) { + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, 
c0_r, c0_i, n); + + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n); + } + else { + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n); + + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n); + } + } + + RVV_TYPE_FX(PREC, LMUL, 2) ab0 = VCREATE_V_FX(PREC, LMUL, 2)(ab0_r, ab0_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab1 = VCREATE_V_FX(PREC, LMUL, 2)(ab1_r, ab1_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab2 = VCREATE_V_FX(PREC, LMUL, 2)(ab2_r, ab2_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab3 = VCREATE_V_FX(PREC, LMUL, 2)(ab3_r, ab3_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab4 = VCREATE_V_FX(PREC, LMUL, 2)(ab4_r, ab4_i); + RVV_TYPE_FX(PREC, LMUL, 2) ab5 = VCREATE_V_FX(PREC, LMUL, 2)(ab5_r, ab5_i); + + if (csc == 1) { + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), ab0, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), ab1, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), ab2, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), ab3, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), ab4, n); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), ab5, n); + } + else { + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, ab0, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, ab1, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, ab2, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, ab3, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, ab4, n); + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), 2 * FLT_SIZE * csc, ab5, n); + } + } + else { + RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r; + RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i; + RVV_TYPE_FX(PREC, LMUL, 2) b0, b1; + RVV_TYPE_F(PREC, LMUL) b0_r, b1_r; + RVV_TYPE_F(PREC, LMUL) b0_i, b1_i; + + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + if (k >= 2) { + b1 = 
VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += PACKNR; + } + + switch (m) { + case 5: + VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4].real, a[4].imag, n); + case 4: + VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3].real, a[3].imag, n); + case 3: + VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2].real, a[2].imag, n); + case 2: + VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1].real, a[1].imag, n); + case 1: + VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0].real, a[0].imag, n); + } + + a += PACKMR; + k -= 1; + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + } + + while (k > 0) { + switch (m) { + case 5: + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b1_r, b1_i, n); + case 4: + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b1_r, b1_i, n); + case 3: + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b1_r, b1_i, n); + case 2: + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b1_r, b1_i, n); + case 1: + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b1_r, b1_i, n); + } + + a += PACKMR; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += PACKNR; + } + + switch (m) { + case 5: + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4].real, a[4].imag, b0_r, b0_i, n); + case 4: + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3].real, a[3].imag, b0_r, b0_i, n); + case 3: + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2].real, a[2].imag, b0_r, b0_i, n); + case 2: + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1].real, a[1].imag, b0_r, b0_i, n); + case 1: + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0].real, a[0].imag, b0_r, b0_i, n); + } + + a += PACKMR; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += PACKNR; + } + } + + RVV_TYPE_F(PREC, LMUL) temp0_r, temp1_r; + RVV_TYPE_F(PREC, LMUL) temp0_i, temp1_i; + switch (m) { + case 5: + temp0_r = VFMUL_VF(PREC, LMUL)(ab4_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab4_r, alpha->imag, n); + ab4_r = VFMSUB_VF(PREC, LMUL)(ab4_r, alpha->real, temp0_r, n); + ab4_i = VFMADD_VF(PREC, LMUL)(ab4_i, alpha->real, temp0_i, n); + case 4: + temp1_r = VFMUL_VF(PREC, LMUL)(ab3_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab3_r, alpha->imag, n); + ab3_r = VFMSUB_VF(PREC, LMUL)(ab3_r, alpha->real, temp1_r, n); + ab3_i = VFMADD_VF(PREC, LMUL)(ab3_i, alpha->real, temp1_i, n); + case 3: + temp0_r = VFMUL_VF(PREC, LMUL)(ab2_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab2_r, alpha->imag, n); + ab2_r = VFMSUB_VF(PREC, LMUL)(ab2_r, alpha->real, temp0_r, n); + ab2_i = VFMADD_VF(PREC, LMUL)(ab2_i, alpha->real, temp0_i, n); + case 2: + temp1_r = VFMUL_VF(PREC, LMUL)(ab1_i, alpha->imag, n); + temp1_i = VFMUL_VF(PREC, LMUL)(ab1_r, alpha->imag, n); + ab1_r = VFMSUB_VF(PREC, LMUL)(ab1_r, alpha->real, temp1_r, n); + ab1_i = VFMADD_VF(PREC, LMUL)(ab1_i, alpha->real, temp1_i, n); + case 1: + temp0_r = VFMUL_VF(PREC, LMUL)(ab0_i, alpha->imag, n); + temp0_i = VFMUL_VF(PREC, LMUL)(ab0_r, alpha->imag, n); + ab0_r = VFMSUB_VF(PREC, LMUL)(ab0_r, alpha->real, temp0_r, n); + ab0_i = VFMADD_VF(PREC, LMUL)(ab0_i, alpha->real, temp0_i, n); + } + + if 
(!PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + if (csc == 1) { + switch (m) { + case 5: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n); + case 4: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + case 3: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + case 2: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + case 1: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + } + + } + else { + switch (m) { + case 5: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n); + case 4: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + case 3: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + case 2: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + case 1: + c0 = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + } + } + } + + RVV_TYPE_FX(PREC, LMUL, 2) ab0, ab1, ab2, ab3, ab4; + switch (m) { + case 5: + ab4 = VCREATE_V_FX(PREC, LMUL, 2)(ab4_r, ab4_i); + case 4: + ab3 = VCREATE_V_FX(PREC, LMUL, 2)(ab3_r, ab3_i); + case 3: + ab2 = VCREATE_V_FX(PREC, LMUL, 2)(ab2_r, ab2_i); + case 2: + ab1 = VCREATE_V_FX(PREC, LMUL, 2)(ab1_r, ab1_i); + case 1: + ab0 = VCREATE_V_FX(PREC, LMUL, 2)(ab0_r, ab0_i); + } + + if (csc == 1) { + switch (m) { + case 5: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), ab4, n); + case 4: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), ab3, n); + case 3: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), ab2, n); + case 2: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), ab1, n); + case 1: + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), ab0, n); + } + } + else { + switch (m) { + case 5: + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), 2 * FLT_SIZE * csc, ab4, n); + case 4: + 
VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), 2 * FLT_SIZE * csc, ab3, n); + case 3: + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), 2 * FLT_SIZE * csc, ab2, n); + case 2: + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), 2 * FLT_SIZE * csc, ab1, n); + case 1: + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), 2 * FLT_SIZE * csc, ab0, n); + } + } + } + + return; +} + +#endif // GEMM diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c new file mode 100644 index 0000000000..605b93fb79 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_intr/bli_gemm_sifive_x280_intr_real.c @@ -0,0 +1,339 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
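As a rough scalar model of the complex microkernel whose intrinsic version ends just above: the accumulators ab0..ab5 hold one row tile of A*B per vector lane, and the kernel then forms C := beta*C + alpha*(A*B). The sketch below expands that update into the same real/imaginary FMA pattern as the VFMUL_VF/VFMSUB_VF/VFMADD_VF and VCMACC_VF calls; the cfloat_t type and the function name are illustrative stand-ins, not identifiers from this patch.

    #include <stddef.h>

    typedef struct { float real, imag; } cfloat_t;   /* illustrative stand-in for the kernel's DATATYPE */

    /* c[i] := beta*c[i] + alpha*ab[i], one lane at a time. */
    static void ref_complex_update(cfloat_t alpha, cfloat_t beta,
                                   const cfloat_t *ab, cfloat_t *c, size_t n)
    {
        for (size_t i = 0; i < n; ++i) {
            /* alpha*ab[i], matching the VFMUL/VFMSUB/VFMADD sequence above */
            float tr = ab[i].real * alpha.real - ab[i].imag * alpha.imag;
            float ti = ab[i].imag * alpha.real + ab[i].real * alpha.imag;
            /* + beta*c[i], matching the VCMACC_VF calls in the beta != 0 path */
            float cr = tr + beta.real * c[i].real - beta.imag * c[i].imag;
            float ci = ti + beta.real * c[i].imag + beta.imag * c[i].real;
            c[i].real = cr;
            c[i].imag = ci;
        }
    }

When beta is zero the kernel skips loading C entirely (the eq0 test above), so the beta terms in the sketch correspond only to the non-zero-beta branch.
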
+ +*/ + +// clang-format off +#ifdef GEMM + +GEMM(PRECISION_CHAR, void) +{ + (void) data; // Suppress unused parameter warnings + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a = a_; + const DATATYPE* restrict b = b_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict c = c_; + + if (m <= 0 || n <= 0 || k < 0) + return; + else if (k == 0) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n); + for (dim_t i = 0; i < m; ++i) { + if (csc == 1) + VSE_V_F(PREC, LMUL)(c + i * rsc, zero_splat, n); + else + VSSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, zero_splat, n); + } + } + else { + for (dim_t i = 0; i < m; ++i) { + RVV_TYPE_F(PREC, LMUL) c0; + if (csc == 1) + c0 = VLE_V_F(PREC, LMUL)(c + i * rsc, n); + else + c0 = VLSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, n); + c0 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + if (csc == 1) + VSE_V_F(PREC, LMUL)(c + i * rsc, c0, n); + else + VSSE_V_F(PREC, LMUL)(c + i * rsc, FLT_SIZE * csc, c0, n); + } + } + } + else if (m == 7) { + RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5, ab6; + bool first = true; + for (dim_t i = 0; i < k; ++i) { + RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n); + if (first) { + ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0], n); + ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1], n); + ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2], n); + ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3], n); + ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4], n); + ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5], n); + ab6 = VFMUL_VF(PREC, LMUL)(b0, a[6], n); + first = false; + } + else { + ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0], b0, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1], b0, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2], b0, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, a[3], b0, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4], b0, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5], b0, n); + ab6 = VFMACC_VF(PREC, LMUL)(ab6, a[6], b0, n); + } + + a += PACKMR; + b += PACKNR; + } + + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n); + } + else { + RVV_TYPE_F(PREC, LMUL) c0; + if (csc == 1) { + c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n); + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n); + c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n); + ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n); + ab6 = VFMACC_VF(PREC, LMUL)(ab6, *beta, c0, n); + } + else { + c0 = VLSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, n); + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab0 = VFMACC_VF(PREC, 
LMUL)(ab0, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n); + c0 = VLSE_V_F(PREC, LMUL)(c + 6 * rsc, FLT_SIZE * csc, n); + ab6 = VFMUL_VF(PREC, LMUL)(ab6, *alpha, n); + ab6 = VFMACC_VF(PREC, LMUL)(ab6, *beta, c0, n); + } + } + + if (csc == 1) { + VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n); + VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n); + VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n); + VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n); + VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n); + VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n); + VSE_V_F(PREC, LMUL)(c + 6 * rsc, ab6, n); + } + else { + VSSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, ab0, n); + VSSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, ab1, n); + VSSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, ab2, n); + VSSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, ab3, n); + VSSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, ab4, n); + VSSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, ab5, n); + VSSE_V_F(PREC, LMUL)(c + 6 * rsc, FLT_SIZE * csc, ab6, n); + } + } + else { + // 0 < m < 7 + RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5; + bool first = true; + for (dim_t i = 0; i < k; ++i) { + RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n); + if (first) { + switch (m) { + case 6: + ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5], n); + case 5: + ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4], n); + case 4: + ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3], n); + case 3: + ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2], n); + case 2: + ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1], n); + case 1: + ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0], n); + } + first = false; + } + else { + switch (m) { + case 6: + ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5], b0, n); + case 5: + ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4], b0, n); + case 4: + ab3 = VFMACC_VF(PREC, LMUL)(ab3, a[3], b0, n); + case 3: + ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2], b0, n); + case 2: + ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1], b0, n); + case 1: + ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0], b0, n); + } + } + + a += PACKMR; + b += PACKNR; + } + + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + switch (m) { + case 6: + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + case 5: + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + case 4: + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + case 3: + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + case 2: + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + case 1: + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + } + } + else { + RVV_TYPE_F(PREC, LMUL) c0; + if (csc == 1) { + switch (m) { + case 6: + c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n); + case 5: + c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n); + case 4: + c0 = VLE_V_F(PREC, LMUL)(c 
+ 3 * rsc, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n); + case 3: + c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n); + case 2: + c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n); + case 1: + c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n); + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n); + } + } + else { + switch (m) { + case 6: + c0 = VLSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, n); + ab5 = VFMUL_VF(PREC, LMUL)(ab5, *alpha, n); + ab5 = VFMACC_VF(PREC, LMUL)(ab5, *beta, c0, n); + case 5: + c0 = VLSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, n); + ab4 = VFMUL_VF(PREC, LMUL)(ab4, *alpha, n); + ab4 = VFMACC_VF(PREC, LMUL)(ab4, *beta, c0, n); + case 4: + c0 = VLSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, n); + ab3 = VFMUL_VF(PREC, LMUL)(ab3, *alpha, n); + ab3 = VFMACC_VF(PREC, LMUL)(ab3, *beta, c0, n); + case 3: + c0 = VLSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, n); + ab2 = VFMUL_VF(PREC, LMUL)(ab2, *alpha, n); + ab2 = VFMACC_VF(PREC, LMUL)(ab2, *beta, c0, n); + case 2: + c0 = VLSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, n); + ab1 = VFMUL_VF(PREC, LMUL)(ab1, *alpha, n); + ab1 = VFMACC_VF(PREC, LMUL)(ab1, *beta, c0, n); + case 1: + c0 = VLSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, n); + ab0 = VFMUL_VF(PREC, LMUL)(ab0, *alpha, n); + ab0 = VFMACC_VF(PREC, LMUL)(ab0, *beta, c0, n); + } + } + } + + if (csc == 1) { + switch (m) { + case 6: + VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n); + case 5: + VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n); + case 4: + VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n); + case 3: + VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n); + case 2: + VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n); + case 1: + VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n); + } + } + else { + switch (m) { + case 6: + VSSE_V_F(PREC, LMUL)(c + 5 * rsc, FLT_SIZE * csc, ab5, n); + case 5: + VSSE_V_F(PREC, LMUL)(c + 4 * rsc, FLT_SIZE * csc, ab4, n); + case 4: + VSSE_V_F(PREC, LMUL)(c + 3 * rsc, FLT_SIZE * csc, ab3, n); + case 3: + VSSE_V_F(PREC, LMUL)(c + 2 * rsc, FLT_SIZE * csc, ab2, n); + case 2: + VSSE_V_F(PREC, LMUL)(c + 1 * rsc, FLT_SIZE * csc, ab1, n); + case 1: + VSSE_V_F(PREC, LMUL)(c + 0 * rsc, FLT_SIZE * csc, ab0, n); + } + } + } + + return; +} + +#endif // GEMM diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c deleted file mode 100644 index 18df010d05..0000000000 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -// clang-format off -#ifdef GEMMTRSM - -GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) -{ - (void) data; - (void) cntx; - const DATATYPE* restrict alpha = alpha_; - const DATATYPE* restrict a10 = a10_; - const DATATYPE* restrict a11 = a11_; - const DATATYPE* restrict b01 = b01_; - const DATATYPE* restrict b11 = b11_; - DATATYPE* restrict c11 = c11_; - - if (m <= 0 || n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - - DATATYPE alpha_cast = *alpha; - if (alpha_cast.real == 0 && alpha_cast.imag == 0) { - switch (m) { - case 6: - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v22, 0"); - case 5: - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v18, 0"); - case 4: - __asm__("vmv.v.i v12, 0"); - __asm__("vmv.v.i v14, 0"); - case 3: - __asm__("vmv.v.i v8, 0"); - __asm__("vmv.v.i v10, 0"); - case 2: - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v6, 0"); - case 1: - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - } - } - else { - const DATATYPE* b11_tmp = b11 + (m - 1) * PACKNR; - switch (m) { - case 6: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 5: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 3: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); - case 1: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag); - } - } - - if (k >= 1) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b01)); - __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); - } - if (k >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b01)); - __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - while (k > 0) { - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), 
"I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE)); - vcnmsac_vf(v0, v2, ft0, ft1, v24, v26); - } - k -= 1; - - if (k == 0) { break; } - - if (k >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b01)); - __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE)); - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v28, v30); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v28, v30); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v28, v30); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v28, v30); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v28, v30); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE)); - vcnmsac_vf(v0, v2, ft0, ft1, v28, v30); - } - k -= 1; - - if (k >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b01)); - __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE)); - vcmul_vf(v24, v26, v0, v2, ft0, ft1); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 1) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : 
: "r"(a11), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE)); - vcmul_vf(v24, v26, v4, v6, ft2, ft3); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 2) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); - vcmul_vf(v24, v26, v8, v10, ft4, ft5); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 3) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); - vcmul_vf(v24, v26, v12, v14, ft6, ft7); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 4) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - 
__asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); - vcmul_vf(v24, v26, v16, v18, ft8, ft9); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 5) return; - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); - __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); - vcmul_vf(v24, v26, v20, v22, ft10, ft11); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - return; -} - -#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c deleted file mode 100644 index a0f9134731..0000000000 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -// clang-format off -#ifdef GEMMTRSM - -GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) -{ - const DATATYPE* restrict alpha = alpha_; - const DATATYPE* restrict a10 = a10_; - const DATATYPE* restrict a11 = a11_; - const DATATYPE* restrict b01 = b01_; - const DATATYPE* restrict b11 = b11_; - DATATYPE* restrict c11 = c11_; - - if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR)) - return; - - dim_t b11_offset, temp; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE)); - - // Multiply step sizes by data size - __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE)); - __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE)); - - __asm__("addi %0, %1, %2": "=r"(b11_offset): "r"(m), "I"(-1)); - __asm__("li %0, %1": "=r"(temp): "I"(PACKNR * FLT_SIZE)); - __asm__("mul %0, %0, %1": "+r"(b11_offset): "r"(temp)); - // b11_offset = (m-1)*PACKNR*FLT_SIZE - - __asm__("add %0, %0, %1": "+r"(b11): "r"(b11_offset)); - __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha)); // TO DO: optimize alpha = 1 case - switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha - case 7: __asm__(VLE " v0, (%0)": : "r"(b11)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 6: __asm__(VLE " v4, (%0)": : "r"(b11)); - __asm__("vfmul.vf v4, v4, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 5: __asm__(VLE " v8, (%0)": : "r"(b11)); - __asm__("vfmul.vf v8, v8, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 4: __asm__(VLE " v12, (%0)": : "r"(b11)); - __asm__("vfmul.vf v12, v12, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 3: __asm__(VLE " v16, (%0)": : "r"(b11)); - __asm__("vfmul.vf v16, v16, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 2: __asm__(VLE " v20, (%0)": : "r"(b11)); - __asm__("vfmul.vf v20, v20, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - case 1: __asm__(VLE " v24, (%0)": : "r"(b11)); - __asm__("vfmul.vf v24, v24, f0"); - // no sub of b11 on final entry - } - // b11 now reset to original value - // v0 = row 6 of b11 - // v4 = row 5 of b11 - // v8 = row 4 of b11 - // v12 = row 3 of b11 - // v16 = row 2 of b11 - // v20 = row 1 of b11 - // v24 = row 0 of b11 - - // GEMM: B11 := alpha * B11 - A10 * B01 - for (dim_t i = 0; i < k; i++){ - __asm__(VLE " v28, (%0)": : "r"(b01)); // kth row of b01 - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(6*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v0, f6, v28"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(5*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v4, f5, v28"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(4*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v8, f4, v28"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(3*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v12, f3, v28"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(2*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v16, f2, v28"); - case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(1*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v20, f1, v28"); - case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(0*FLT_SIZE), "r"(a10)); - __asm__("vfnmsac.vf v24, f0, v28"); - } - __asm__("addi %0, %0, %1": "+r"(a10): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b01): "I"(PACKNR * FLT_SIZE)); - } - // TRSM: B11 := inv(A11) * B11 - // TO DO: Investigate code size reduction (loop rerolling) - - // Row 0 - __asm__(FLT_LOAD " f0, 
%0(%1)": : "I"(0*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v24, v24, f0"); - __asm__(VSE " v24, (%0)": : "r"(b11)); - __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 1) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v24"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v24"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v24"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v24"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v16, f2, v24"); - case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v20, f1, v24"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 1 - __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v20, v20, f1"); - __asm__(VSE " v20, (%0)": : "r"(b11)); - __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 2) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v20"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v20"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v20"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v20"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v16, f2, v20"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 2 - __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v16, v16, f2"); - __asm__(VSE " v16, (%0)": : "r"(b11)); - __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 3) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v16"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v16"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v16"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v16"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 3 - __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v12, v12, f3"); - __asm__(VSE " v12, (%0)": : "r"(b11)); - __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 4) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v12"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v12"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v12"); - } - // Pointer bumps 
- __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 4 - __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v8, v8, f4"); - __asm__(VSE " v8, (%0)": : "r"(b11)); - __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 5) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v8"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v8"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 5 - __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v4, v4, f5"); - __asm__(VSE " v4, (%0)": : "r"(b11)); - __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 6) return; - - __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v4"); - - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 6 - __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v0, v0, f6"); - __asm__(VSE " v0, (%0)": : "r"(b11)); - __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc)); -} -#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c deleted file mode 100644 index 9332fd0963..0000000000 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -// clang-format off -#ifdef GEMMTRSM - -GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) -{ - (void) data; - (void) cntx; - const DATATYPE* restrict alpha = alpha_; - const DATATYPE* restrict a12 = a12_; - const DATATYPE* restrict a11 = a11_; - const DATATYPE* restrict b21 = b21_; - const DATATYPE* restrict b11 = b11_; - DATATYPE* restrict c11 = c11_; - - if (m <= 0 || n <= 0) - return; - - __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); - - DATATYPE alpha_cast = *alpha; - if (alpha_cast.real == 0 && alpha_cast.imag == 0) { - switch (m) { - case 6: - __asm__("vmv.v.i v20, 0"); - __asm__("vmv.v.i v22, 0"); - case 5: - __asm__("vmv.v.i v16, 0"); - __asm__("vmv.v.i v18, 0"); - case 4: - __asm__("vmv.v.i v12, 0"); - __asm__("vmv.v.i v14, 0"); - case 3: - __asm__("vmv.v.i v8, 0"); - __asm__("vmv.v.i v10, 0"); - case 2: - __asm__("vmv.v.i v4, 0"); - __asm__("vmv.v.i v6, 0"); - case 1: - __asm__("vmv.v.i v0, 0"); - __asm__("vmv.v.i v2, 0"); - } - } - else { - const DATATYPE* b11_tmp = b11; - switch (m) { - case 6: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 5: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 4: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 3: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 2: - __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag); - __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); - case 1: - __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); - vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag); - } - } - - if (k >= 1) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b21)); - __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); - } - if (k >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b21)); - __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); - } - - a12 += m - 1; - - while (k > 0) { - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * 
FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE)); - vcnmsac_vf(v0, v2, ft0, ft1, v24, v26); - } - k -= 1; - - if (k == 0) { break; } - - if (k >= 2) { - __asm__(VLSEG2 "v24, (%0)" : : "r"(b21)); - __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE)); - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v28, v30); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v28, v30); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v28, v30); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v28, v30); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v28, v30); - case 1: - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE)); - vcnmsac_vf(v0, v2, ft0, ft1, v28, v30); - } - k -= 1; - - if (k >= 2) { - __asm__(VLSEG2 "v28, (%0)" : : "r"(b21)); - __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); - } - __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE)); - } - - a11 += (m - 1) * (PACKMR + 1); // (m - 1) + (m - 1) * PACKMR - b11 += (m - 1) * PACKNR; - c11 += (m - 1) * rsc; - rsc *= 2 * FLT_SIZE; - csc *= 2 * FLT_SIZE; - - __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE)); - __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE)); - vcmul_vf(v24, v26, v0, v2, ft0, ft1); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 1) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - case 2: - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), 
"I"(-2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE)); - vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE)); - __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE)); - vcmul_vf(v24, v26, v4, v6, ft2, ft3); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 2) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - case 3: - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); - vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); - __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); - vcmul_vf(v24, v26, v8, v10, ft4, ft5); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 3) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - case 4: - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); - vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); - __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); - vcmul_vf(v24, v26, v12, v14, ft6, ft7); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 4) return; - - switch (m) { - case 6: - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - case 5: - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); - } - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR 
* 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); - __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); - vcmul_vf(v24, v26, v16, v18, ft8, ft9); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - if (m == 5) return; - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); - - __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); - __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); - __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); - - __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); - __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); - vcmul_vf(v24, v26, v20, v22, ft10, ft11); - __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); - __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); - - return; -} -#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c deleted file mode 100644 index 2d511a8ba6..0000000000 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023, SiFive, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
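The deleted upper-triangular kernels above mirror the lower ones: B11 := alpha*B11 - A12*B21, then back substitution that starts from the bottom row of B11 and works upward, again multiplying by a pre-inverted packed diagonal. A minimal scalar sketch of just the substitution step, with illustrative names and the same packing assumptions as the lower sketch:

    /* Back substitution for the upper case: solve U*X = B in place, bottom row first,
       with the packed diagonal of A11 holding 1/a_ii. */
    static void ref_trsm_u(int m, int n, const float *a11, int packmr,
                           float *b11, int packnr)
    {
        for (int i = m - 1; i >= 0; --i)
            for (int j = 0; j < n; ++j) {
                float x = b11[i*packnr + j] * a11[i + i*packmr];  /* 1/a_ii on the diagonal */
                for (int r = 0; r < i; ++r)
                    b11[r*packnr + j] -= a11[r + i*packmr] * x;   /* vfnmsac.vf, lane-wise */
                b11[i*packnr + j] = x;
            }
    }
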
- -*/ - -// clang-format off -#ifdef GEMMTRSM - -GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) -{ - const DATATYPE* restrict alpha = alpha_; - const DATATYPE* restrict a12 = a12_; - const DATATYPE* restrict a11 = a11_; - const DATATYPE* restrict b21 = b21_; - const DATATYPE* restrict b11 = b11_; - DATATYPE* restrict c11 = c11_; - - if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR)) - return; - - dim_t m_sz, a11_offset, c11_offset, temp; - size_t vl; - __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE)); - - // Multiply step sizes by data size - __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE)); - __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE)); - __asm__("slli %0, %1, %2": "=r"(m_sz) : "r"(m), "I"(LOG_FLT_SIZE)); - - __asm__("li %0, %1": "=r"(temp): "I"((PACKMR+1)*FLT_SIZE)); - __asm__("mul %0, %1, %2": "=r"(a11_offset) : "r"(m), "r"(temp)); - __asm__("addi %0, %0, %1": "+r"(a11_offset) : "I"(-PACKMR * FLT_SIZE)); - __asm__("mul %0, %1, %2": "=r"(c11_offset) : "r"(m), "r"(rsc)); - __asm__("sub %0, %0, %1": "+r"(c11_offset) : "r"(rsc)); - // a11_offset = (PACKMR*(m-1)+m)*sz = m*(PACKMR+1)*FLT_SIZE - PACKMR*FLT_SIZE - // c11_offset = rsc*(m-1)*sz - - __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha)); - switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha - case 7: __asm__(VLE " v0, (%0)": : "r"(b11)); - __asm__("vfmul.vf v0, v0, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 6: __asm__(VLE " v4, (%0)": : "r"(b11)); - __asm__("vfmul.vf v4, v4, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 5: __asm__(VLE " v8, (%0)": : "r"(b11)); - __asm__("vfmul.vf v8, v8, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 4: __asm__(VLE " v12, (%0)": : "r"(b11)); - __asm__("vfmul.vf v12, v12, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 3: __asm__(VLE " v16, (%0)": : "r"(b11)); - __asm__("vfmul.vf v16, v16, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 2: __asm__(VLE " v20, (%0)": : "r"(b11)); - __asm__("vfmul.vf v20, v20, f0"); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); - case 1: __asm__(VLE " v24, (%0)": : "r"(b11)); - __asm__("vfmul.vf v24, v24, f0"); - // no add of b11 on final entry - } - // b11 now positioned at start of last row - // v24 = row 0 from bottom (bottom row) - // v20 = row 1 from bottom - // v16 = row 2 from bottom - // v12 = row 3 from bottom - // v8 = row 4 from bottom - // v4 = row 5 from bottom - // v0 = row 6 from bottom - - // GEMM: B11 := alpha * B11 - A12 * B21 - __asm__("add %0, %0, %1": "+r"(a12): "r"(m_sz)); - for (dim_t i = 0; i < k; i++){ - __asm__(VLE " v28, (%0)": : "r"(b21)); // kth row of b21 - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(-7*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v0, f6, v28"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(-6*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v4, f5, v28"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(-5*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v8, f4, v28"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(-4*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v12, f3, v28"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(-3*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v16, f2, v28"); - case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(-2*FLT_SIZE), "r"(a12)); - __asm__("vfnmsac.vf v20, f1, v28"); - case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(-1*FLT_SIZE), 
"r"(a12)); - __asm__("vfnmsac.vf v24, f0, v28"); - } - __asm__("addi %0, %0, %1": "+r"(a12): "I"(PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b21): "I"(PACKNR * FLT_SIZE)); - } - // TRSM: B11 := inv(A11) * B11 - // Move a11 to end of array and c11 to first entry in last row - __asm__("add %0, %0, %1": "+r"(a11): "r"(a11_offset)); - __asm__("add %0, %0, %1": "+r"(c11): "r"(c11_offset)); - - // Row 0 from bottom (bottom row) - __asm__(FLT_LOAD " f0, %0(%1)": : "I"(-1*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v24, v24, f0"); - __asm__(VSE " v24, (%0)": : "r"(b11)); - __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 1) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v24"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v24"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v24"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v24"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v16, f2, v24"); - case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v20, f1, v24"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 1 from bottom - __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v20, v20, f1"); - __asm__(VSE " v20, (%0)": : "r"(b11)); - __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 2) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v20"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v20"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v20"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v20"); - case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v16, f2, v20"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 2 from bottom - __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v16, v16, f2"); - __asm__(VSE " v16, (%0)": : "r"(b11)); - __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 3) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v16"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v16"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v16"); - case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v12, f3, v16"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 3 from bottom - __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), 
"r"(a11)); - __asm__("vfmul.vf v12, v12, f3"); - __asm__(VSE " v12, (%0)": : "r"(b11)); - __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 4) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v12"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v12"); - case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v8, f4, v12"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 4 from bottom - __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v8, v8, f4"); - __asm__(VSE " v8, (%0)": : "r"(b11)); - __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 5) return; - - switch (m){ - case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v8"); - case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v4, f5, v8"); - } - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 5 from bottom - __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v4, v4, f5"); - __asm__(VSE " v4, (%0)": : "r"(b11)); - __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc)); - if (m == 6) return; - - __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfnmsac.vf v0, f6, v4"); - - // Pointer bumps - __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); - __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); - __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); - - // Row 6 from bottom - __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); - __asm__("vfmul.vf v0, v0, f6"); - __asm__(VSE " v0, (%0)": : "r"(b11)); - __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc)); - -} -#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c similarity index 75% rename from kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c rename to kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c index 7cb8d9e070..687abec185 100644 --- a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2023, SiFive, Inc. + Copyright (C) 2024, SiFive, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,12 +34,12 @@ // clang-format off #include "blis.h" -#include "../../riscv_cmul_macros_asm.h" +#include "../../riscv_cmul_macros_intr.h" #include "../../bli_kernels_sifive_x280.h" #include #include -#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_asm(\ +#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_intr(\ dim_t m, \ dim_t n, \ dim_t k, \ @@ -55,7 +55,7 @@ const cntx_t* restrict cntx \ ) -#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_asm(\ +#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_intr(\ dim_t m, \ dim_t n, \ dim_t k, \ @@ -76,108 +76,83 @@ // Single precision real #define DATATYPE float #define PRECISION_CHAR s +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) #define PACKMR 8 #define PACKNR 64 -#define VLE "vle32.v" -#define VSE "vse32.v" -#define VSSE "vsse32.v" -#define FLT_LOAD "flw" -#define FLT_SIZE sizeof(float) -#define LOG_FLT_SIZE 2 - -#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c" -#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c" +#include "./bli_gemmtrsm_sifive_x280_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE #undef PACKMR #undef PACKNR -#undef VLE -#undef VSE -#undef VSSE -#undef FLT_LOAD -#undef FLT_SIZE -#undef LOG_FLT_SIZE // Double precision real #define DATATYPE double #define PRECISION_CHAR d +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) #define PACKMR 8 #define PACKNR 32 -#define VLE "vle64.v" -#define VSE "vse64.v" -#define VSSE "vsse64.v" -#define FLT_LOAD "fld" -#define FLT_SIZE sizeof(double) -#define LOG_FLT_SIZE 3 -#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c" -#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c" +#include "./bli_gemmtrsm_sifive_x280_intr_real.c" #undef DATATYPE #undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE #undef PACKMR #undef PACKNR -#undef VLE -#undef VSE -#undef VSSE -#undef FLT_LOAD -#undef FLT_SIZE -#undef LOG_FLT_SIZE // Single precision complex #define DATATYPE scomplex +#define BASE_DT float #define PRECISION_CHAR c +#define PREC 32 +#define LMUL m2 +#define FLT_SIZE sizeof(float) #define PACKMR 8 #define PACKNR 32 -#define VLSEG2 "vlseg2e32.v " -#define VSSEG2 "vsseg2e32.v " -#define VSSSEG2 "vssseg2e32.v " -#define FLT_LOAD "flw " -#define FLT_SIZE sizeof(float) -#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c" -#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c" +#include "./bli_gemmtrsm_sifive_x280_intr_complex.c" #undef DATATYPE +#undef BASE_DT #undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE #undef PACKMR #undef PACKNR -#undef VLSEG2 -#undef VSSEG2 -#undef VSSSEG2 -#undef FLT_LOAD -#undef FLT_SIZE // Double precision complex #define DATATYPE dcomplex +#define BASE_DT double #define PRECISION_CHAR z +#define PREC 64 +#define LMUL m2 +#define FLT_SIZE sizeof(double) #define PACKMR 8 #define PACKNR 16 -#define VLSEG2 "vlseg2e64.v " -#define VSSEG2 "vsseg2e64.v " -#define VSSSEG2 "vssseg2e64.v " -#define FLT_LOAD "fld " -#define FLT_SIZE sizeof(double) -#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c" -#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c" +#include "./bli_gemmtrsm_sifive_x280_intr_complex.c" #undef DATATYPE +#undef BASE_DT #undef PRECISION_CHAR +#undef PREC +#undef LMUL 
+#undef FLT_SIZE #undef PACKMR #undef PACKNR -#undef VLSEG -#undef VSSEG -#undef VSSSEG -#undef FLT_LOAD -#undef FLT_SIZE - - #undef GEMMTRSM #undef GEMMTRSM_L #undef GEMMTRSM_U - - diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..88ea04b7a9 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_complex.c @@ -0,0 +1,437 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef GEMMTRSM + +#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr +#define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) + +static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + dim_t m, + dim_t n, + dim_t k, + const DATATYPE* restrict beta, + const DATATYPE* restrict a, inc_t rsa, inc_t csa, + const DATATYPE* restrict b, inc_t rsb, + DATATYPE* restrict c, inc_t rsc, + const DATATYPE* restrict a11, inc_t rsa11, inc_t csa11, + DATATYPE* restrict c11, inc_t rsc11, inc_t csc11 + ) +{ + // This function computes inv(a11) * (beta * c - a * b) + // and stores the result in c and c11. 
+ + RVV_TYPE_F(PREC, LMUL) ab0_r, ab1_r, ab2_r, ab3_r, ab4_r, ab5_r; + RVV_TYPE_F(PREC, LMUL) ab0_i, ab1_i, ab2_i, ab3_i, ab4_i, ab5_i; + // gemm step + if (m <= 0 || n <= 0 || k < 0) + return; + else if (k == 0) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n); + switch (m) { + case 6: + ab5_r = zero_splat; + ab5_i = zero_splat; + case 5: + ab4_r = zero_splat; + ab4_i = zero_splat; + case 4: + ab3_r = zero_splat; + ab3_i = zero_splat; + case 3: + ab2_r = zero_splat; + ab2_i = zero_splat; + case 2: + ab1_r = zero_splat; + ab1_i = zero_splat; + case 1: + ab0_r = zero_splat; + ab0_i = zero_splat; + } + } + else { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + + switch (m) { + case 6: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 5 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, c0_r, c0_i, beta->real, beta->imag, n); + case 5: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 4 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, c0_r, c0_i, beta->real, beta->imag, n); + case 4: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 3 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, c0_r, c0_i, beta->real, beta->imag, n); + case 3: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 2 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, c0_r, c0_i, beta->real, beta->imag, n); + case 2: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 1 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, c0_r, c0_i, beta->real, beta->imag, n); + case 1: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 0 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, c0_r, c0_i, beta->real, beta->imag, n); + } + } + } + else { + RVV_TYPE_FX(PREC, LMUL, 2) b0, b1; + RVV_TYPE_F(PREC, LMUL) b0_r, b1_r; + RVV_TYPE_F(PREC, LMUL) b0_i, b1_i; + + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += rsb; + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += rsb; + } + + switch (m) { + case 6: + VCMUL_VF(PREC, LMUL, ab5_r, ab5_i, b0_r, b0_i, a[5 * rsa].real, a[5 * rsa].imag, n); + case 5: + VCMUL_VF(PREC, LMUL, ab4_r, ab4_i, b0_r, b0_i, a[4 * rsa].real, a[4 * rsa].imag, n); + case 4: + VCMUL_VF(PREC, LMUL, ab3_r, ab3_i, b0_r, b0_i, a[3 * rsa].real, a[3 * rsa].imag, n); + case 3: + VCMUL_VF(PREC, LMUL, ab2_r, ab2_i, b0_r, b0_i, a[2 * rsa].real, a[2 * rsa].imag, n); + case 2: + VCMUL_VF(PREC, LMUL, ab1_r, ab1_i, b0_r, b0_i, a[1 * rsa].real, a[1 * rsa].imag, n); + case 1: + VCMUL_VF(PREC, LMUL, ab0_r, ab0_i, b0_r, b0_i, a[0 * rsa].real, a[0 * rsa].imag, n); + } + + a += csa; + k -= 1; + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += rsb; + } + + while (k > 0) { + switch (m) { + case 6: + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5 * rsa].real, a[5 * rsa].imag, b1_r, b1_i, n); + case 5: + 
VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4 * rsa].real, a[4 * rsa].imag, b1_r, b1_i, n); + case 4: + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3 * rsa].real, a[3 * rsa].imag, b1_r, b1_i, n); + case 3: + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2 * rsa].real, a[2 * rsa].imag, b1_r, b1_i, n); + case 2: + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1 * rsa].real, a[1 * rsa].imag, b1_r, b1_i, n); + case 1: + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0 * rsa].real, a[0 * rsa].imag, b1_r, b1_i, n); + } + + a += csa; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b1 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b1_r = VGET_V_F(PREC, LMUL, 2)(b1, 0); + b1_i = VGET_V_F(PREC, LMUL, 2)(b1, 1); + b += rsb; + } + + switch (m) { + case 6: + VCMACC_VF(PREC, LMUL, ab5_r, ab5_i, a[5 * rsa].real, a[5 * rsa].imag, b0_r, b0_i, n); + case 5: + VCMACC_VF(PREC, LMUL, ab4_r, ab4_i, a[4 * rsa].real, a[4 * rsa].imag, b0_r, b0_i, n); + case 4: + VCMACC_VF(PREC, LMUL, ab3_r, ab3_i, a[3 * rsa].real, a[3 * rsa].imag, b0_r, b0_i, n); + case 3: + VCMACC_VF(PREC, LMUL, ab2_r, ab2_i, a[2 * rsa].real, a[2 * rsa].imag, b0_r, b0_i, n); + case 2: + VCMACC_VF(PREC, LMUL, ab1_r, ab1_i, a[1 * rsa].real, a[1 * rsa].imag, b0_r, b0_i, n); + case 1: + VCMACC_VF(PREC, LMUL, ab0_r, ab0_i, a[0 * rsa].real, a[0 * rsa].imag, b0_r, b0_i, n); + } + + a += csa; + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + b0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) b, n); + b0_r = VGET_V_F(PREC, LMUL, 2)(b0, 0); + b0_i = VGET_V_F(PREC, LMUL, 2)(b0, 1); + b += rsb; + } + } + + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + switch (m) { + case 6: + ab5_r = VFNEG_VF(PREC, LMUL)(ab5_r, n); + ab5_i = VFNEG_VF(PREC, LMUL)(ab5_i, n); + case 5: + ab4_r = VFNEG_VF(PREC, LMUL)(ab4_r, n); + ab4_i = VFNEG_VF(PREC, LMUL)(ab4_i, n); + case 4: + ab3_r = VFNEG_VF(PREC, LMUL)(ab3_r, n); + ab3_i = VFNEG_VF(PREC, LMUL)(ab3_i, n); + case 3: + ab2_r = VFNEG_VF(PREC, LMUL)(ab2_r, n); + ab2_i = VFNEG_VF(PREC, LMUL)(ab2_i, n); + case 2: + ab1_r = VFNEG_VF(PREC, LMUL)(ab1_r, n); + ab1_i = VFNEG_VF(PREC, LMUL)(ab1_i, n); + case 1: + ab0_r = VFNEG_VF(PREC, LMUL)(ab0_r, n); + ab0_i = VFNEG_VF(PREC, LMUL)(ab0_i, n); + } + } + else { + RVV_TYPE_FX(PREC, LMUL, 2) c0; + RVV_TYPE_F(PREC, LMUL) c0_r, c0_i; + switch (m) { + case 6: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 5 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab5_r, ab5_i, beta->real, beta->imag, c0_r, c0_i, n); + case 5: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 4 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab4_r, ab4_i, beta->real, beta->imag, c0_r, c0_i, n); + case 4: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 3 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab3_r, ab3_i, beta->real, beta->imag, c0_r, c0_i, n); + case 3: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 2 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab2_r, ab2_i, beta->real, beta->imag, c0_r, c0_i, n); + case 2: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 1 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab1_r, ab1_i, beta->real, beta->imag, c0_r, c0_i, n); + case 1: + c0 = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) (c + 0 * rsc), n); + c0_r = VGET_V_F(PREC, LMUL, 2)(c0, 0); + c0_i = 
VGET_V_F(PREC, LMUL, 2)(c0, 1); + VCMSAC_VF(PREC, LMUL, ab0_r, ab0_i, beta->real, beta->imag, c0_r, c0_i, n); + } + } + } + + // trsm step + RVV_TYPE_FX(PREC, LMUL, 2) temp = VUNDEFINED_FX(PREC, LMUL, 2)(); + RVV_TYPE_F(PREC, LMUL) temp_r, temp_i; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab0_r, ab0_i, a11[0 * rsa11].real, a11[0 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 0 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 0 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 0 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 1) return; + switch (m) { + case 6: + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + case 5: + VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n); + case 4: + VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n); + case 3: + VCNMSAC_VF(PREC, LMUL, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, temp_r, temp_i, n); + case 2: + VCNMSAC_VF(PREC, LMUL, ab1_r, ab1_i, a11[1 * rsa11].real, a11[1 * rsa11].imag, temp_r, temp_i, n); + } + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab1_r, ab1_i, a11[1 * rsa11].real, a11[1 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 1 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 1 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 1 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 2) return; + switch (m) { + case 6: + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + case 5: + VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n); + case 4: + VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n); + case 3: + VCNMSAC_VF(PREC, LMUL, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, temp_r, temp_i, n); + } + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab2_r, ab2_i, a11[2 * rsa11].real, a11[2 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 2 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 2 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 2 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 3) return; + switch (m) { + case 6: + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + case 5: + VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n); + case 4: + VCNMSAC_VF(PREC, LMUL, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, temp_r, temp_i, n); + } + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab3_r, ab3_i, a11[3 * rsa11].real, a11[3 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 3 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 3 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 3 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 
4) return; + switch (m) { + case 6: + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + case 5: + VCNMSAC_VF(PREC, LMUL, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, temp_r, temp_i, n); + } + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab4_r, ab4_i, a11[4 * rsa11].real, a11[4 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 4 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 4 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 4 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + if (m == 5) return; + VCNMSAC_VF(PREC, LMUL, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, temp_r, temp_i, n); + a11 += csa11; + + VCMUL_VF(PREC, LMUL, temp_r, temp_i, ab5_r, ab5_i, a11[5 * rsa11].real, a11[5 * rsa11].imag, n); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 0, temp_r); + temp = VSET_V_F(PREC, LMUL, 2)(temp, 1, temp_i); + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c + 5 * rsc), temp, n); + if (csc11 == 1) + VSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 5 * rsc11), temp, n); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*)(c11 + 5 * rsc11), 2 * FLT_SIZE * csc11, temp, n); + return; +} + +GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a10 = a10_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b01 = b01_; + DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + m, n, k, + alpha, + a10, 1, PACKMR, + b01, PACKNR, + b11, PACKNR, + a11, 1, PACKMR, + c11, rsc, csc + ); + + return; +} + +GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a12 = a12_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b21 = b21_; + DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + m, n, k, + alpha, + a12 + (m - 1), -1, PACKMR, + b21, PACKNR, + b11 + (m - 1) * PACKNR, -PACKNR, + a11 + (m - 1) + (m - 1) * PACKMR, -1, -PACKMR, + c11 + (m - 1) * rsc, -rsc, csc + ); + + return; +} + +#undef GEMMTRSM_IMPL_NAME_ +#undef GEMMTRSM_IMPL_NAME + +#endif // GEMMTRSM diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c new file mode 100644 index 0000000000..7c3c3b8b7b --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_intr/bli_gemmtrsm_sifive_x280_intr_real.c @@ -0,0 +1,364 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
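Both the complex kernel above and the real-domain kernel whose source follows replace the separate lower/upper assembly bodies deleted earlier in this patch with one static implementation: GEMMTRSM_U simply points a12, b11, a11, and c11 at their last rows and negates the row strides, so the upper-triangular backward substitution runs through the same forward loop used by GEMMTRSM_L. Note also that the implementation multiplies by the a11 diagonal (VFMUL_VF / VCMUL_VF) rather than dividing, which assumes the diagonal arrives pre-inverted from packing, BLIS's default behavior for trsm micro-kernels. The standalone scalar model below is illustrative only and is not part of the patch; trsv_lower_preinv is a made-up name.

#include <stdio.h>

/* x := inv(L) * x for a lower-triangular L whose diagonal entries are
   stored already inverted; row/column strides may be negative. */
static void trsv_lower_preinv(int m, const double *l, int rsl, int csl,
                              double *x, int incx)
{
    for (int i = 0; i < m; ++i) {
        double xi = x[i * incx];
        for (int j = 0; j < i; ++j)
            xi -= l[i * rsl + j * csl] * x[j * incx];
        x[i * incx] = xi * l[i * rsl + i * csl]; /* multiply: l_ii holds 1/diag */
    }
}

int main(void)
{
    /* Upper-triangular U = [2 1 1; 0 4 1; 0 0 5], row-major, with its
       diagonal stored pre-inverted, and right-hand side [7 9 10]. */
    double u[9] = { 0.5, 1.0,  1.0,
                    0.0, 0.25, 1.0,
                    0.0, 0.0,  0.2 };
    double x[3] = { 7.0, 9.0, 10.0 };

    /* Reuse the lower-triangular loop by starting at the last element and
       negating every stride, as GEMMTRSM_U does with
       a11 + (m-1) + (m-1)*PACKMR and strides -1, -PACKMR. */
    trsv_lower_preinv(3, &u[8], -3, -1, &x[2], -1);

    printf("%g %g %g\n", x[0], x[1], x[2]); /* prints 1.625 1.75 2 */
    return 0;
}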
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef GEMMTRSM + +#define GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) bli_##PRECISION_CHAR##gemmtrsm_sifive_x280_intr +#define GEMMTRSM_IMPL_NAME(PRECISION_CHAR) GEMMTRSM_IMPL_NAME_(PRECISION_CHAR) + +static void GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + dim_t m, + dim_t n, + dim_t k, + const DATATYPE* restrict beta, + const DATATYPE* restrict a, inc_t rsa, inc_t csa, + const DATATYPE* restrict b, inc_t rsb, + DATATYPE* restrict c, inc_t rsc, + const DATATYPE* restrict a11, inc_t rsa11, inc_t csa11, + DATATYPE* restrict c11, inc_t rsc11, inc_t csc11 + ) +{ + // This function computes inv(a11) * (beta * c - a * b) + // and stores the result in c and c11. + + RVV_TYPE_F(PREC, LMUL) ab0, ab1, ab2, ab3, ab4, ab5, ab6; + // gemm step + if (m <= 0 || n <= 0 || k < 0) + return; + else if (k == 0) { + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + RVV_TYPE_F(PREC, LMUL) zero_splat = VFMV_V_F(PREC, LMUL)(0., n); + switch (m) { + case 7: + ab6 = zero_splat; + case 6: + ab5 = zero_splat; + case 5: + ab4 = zero_splat; + case 4: + ab3 = zero_splat; + case 3: + ab2 = zero_splat; + case 2: + ab1 = zero_splat; + case 1: + ab0 = zero_splat; + } + } + else { + RVV_TYPE_F(PREC, LMUL) c0; + switch (m) { + case 7: + c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n); + ab6 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 6: + c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n); + ab5 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 5: + c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n); + ab4 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 4: + c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n); + ab3 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 3: + c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n); + ab2 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 2: + c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n); + ab1 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + case 1: + c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n); + ab0 = VFMUL_VF(PREC, LMUL)(c0, *beta, n); + } + } + } + else { + bool first = true; + for (dim_t i = 0; i < k; ++i) { + RVV_TYPE_F(PREC, LMUL) b0 = VLE_V_F(PREC, LMUL)(b, n); + if (first) { + switch (m) { + case 7: + ab6 = VFMUL_VF(PREC, LMUL)(b0, a[6 * rsa], n); + case 6: + ab5 = VFMUL_VF(PREC, LMUL)(b0, a[5 * rsa], n); + case 5: + ab4 = VFMUL_VF(PREC, LMUL)(b0, a[4 * rsa], n); + case 4: + ab3 = VFMUL_VF(PREC, LMUL)(b0, a[3 * rsa], n); + case 3: + ab2 = VFMUL_VF(PREC, LMUL)(b0, a[2 * rsa], n); + case 2: + ab1 = VFMUL_VF(PREC, LMUL)(b0, a[1 * rsa], n); + case 1: + ab0 = VFMUL_VF(PREC, LMUL)(b0, a[0 * rsa], n); + } + first = false; + } + else { + switch (m) { + case 7: + ab6 = VFMACC_VF(PREC, LMUL)(ab6, a[6 * rsa], b0, n); + case 6: + ab5 = VFMACC_VF(PREC, LMUL)(ab5, a[5 * rsa], b0, n); + case 5: + ab4 = VFMACC_VF(PREC, LMUL)(ab4, a[4 * rsa], b0, n); + case 4: + ab3 = VFMACC_VF(PREC, LMUL)(ab3, 
a[3 * rsa], b0, n); + case 3: + ab2 = VFMACC_VF(PREC, LMUL)(ab2, a[2 * rsa], b0, n); + case 2: + ab1 = VFMACC_VF(PREC, LMUL)(ab1, a[1 * rsa], b0, n); + case 1: + ab0 = VFMACC_VF(PREC, LMUL)(ab0, a[0 * rsa], b0, n); + } + } + + a += csa; + b += rsb; + } + + if (PASTEMAC(PRECISION_CHAR, eq0)(*beta)) { + switch (m) { + case 7: + ab6 = VFNEG_VF(PREC, LMUL)(ab6, n); + case 6: + ab5 = VFNEG_VF(PREC, LMUL)(ab5, n); + case 5: + ab4 = VFNEG_VF(PREC, LMUL)(ab4, n); + case 4: + ab3 = VFNEG_VF(PREC, LMUL)(ab3, n); + case 3: + ab2 = VFNEG_VF(PREC, LMUL)(ab2, n); + case 2: + ab1 = VFNEG_VF(PREC, LMUL)(ab1, n); + case 1: + ab0 = VFNEG_VF(PREC, LMUL)(ab0, n); + } + } + else { + RVV_TYPE_F(PREC, LMUL) c0; + switch (m) { + case 7: + c0 = VLE_V_F(PREC, LMUL)(c + 6 * rsc, n); + ab6 = VFMSAC_VF(PREC, LMUL)(ab6, *beta, c0, n); + case 6: + c0 = VLE_V_F(PREC, LMUL)(c + 5 * rsc, n); + ab5 = VFMSAC_VF(PREC, LMUL)(ab5, *beta, c0, n); + case 5: + c0 = VLE_V_F(PREC, LMUL)(c + 4 * rsc, n); + ab4 = VFMSAC_VF(PREC, LMUL)(ab4, *beta, c0, n); + case 4: + c0 = VLE_V_F(PREC, LMUL)(c + 3 * rsc, n); + ab3 = VFMSAC_VF(PREC, LMUL)(ab3, *beta, c0, n); + case 3: + c0 = VLE_V_F(PREC, LMUL)(c + 2 * rsc, n); + ab2 = VFMSAC_VF(PREC, LMUL)(ab2, *beta, c0, n); + case 2: + c0 = VLE_V_F(PREC, LMUL)(c + 1 * rsc, n); + ab1 = VFMSAC_VF(PREC, LMUL)(ab1, *beta, c0, n); + case 1: + c0 = VLE_V_F(PREC, LMUL)(c + 0 * rsc, n); + ab0 = VFMSAC_VF(PREC, LMUL)(ab0, *beta, c0, n); + } + } + } + + // trsm step + ab0 = VFMUL_VF(PREC, LMUL)(ab0, a11[0 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 0 * rsc, ab0, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 0 * rsc11, ab0, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 0 * rsc11, FLT_SIZE * csc11, ab0, n); + if (m == 1) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab0, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab0, n); + case 5: + ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab0, n); + case 4: + ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab0, n); + case 3: + ab2 = VFNMSAC_VF(PREC, LMUL)(ab2, a11[2 * rsa11], ab0, n); + case 2: + ab1 = VFNMSAC_VF(PREC, LMUL)(ab1, a11[1 * rsa11], ab0, n); + } + a11 += csa11; + + ab1 = VFMUL_VF(PREC, LMUL)(ab1, a11[1 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 1 * rsc, ab1, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 1 * rsc11, ab1, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 1 * rsc11, FLT_SIZE * csc11, ab1, n); + if (m == 2) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab1, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab1, n); + case 5: + ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab1, n); + case 4: + ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab1, n); + case 3: + ab2 = VFNMSAC_VF(PREC, LMUL)(ab2, a11[2 * rsa11], ab1, n); + } + a11 += csa11; + + ab2 = VFMUL_VF(PREC, LMUL)(ab2, a11[2 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 2 * rsc, ab2, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 2 * rsc11, ab2, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 2 * rsc11, FLT_SIZE * csc11, ab2, n); + if (m == 3) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab2, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab2, n); + case 5: + ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab2, n); + case 4: + ab3 = VFNMSAC_VF(PREC, LMUL)(ab3, a11[3 * rsa11], ab2, n); + } + a11 += csa11; + + ab3 = VFMUL_VF(PREC, LMUL)(ab3, a11[3 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 3 * rsc, ab3, n); + if (csc11 == 1) + 
VSE_V_F(PREC, LMUL)(c11 + 3 * rsc11, ab3, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 3 * rsc11, FLT_SIZE * csc11, ab3, n); + if (m == 4) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab3, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab3, n); + case 5: + ab4 = VFNMSAC_VF(PREC, LMUL)(ab4, a11[4 * rsa11], ab3, n); + } + a11 += csa11; + + ab4 = VFMUL_VF(PREC, LMUL)(ab4, a11[4 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 4 * rsc, ab4, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 4 * rsc11, ab4, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 4 * rsc11, FLT_SIZE * csc11, ab4, n); + if (m == 5) return; + switch (m) { + case 7: + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab4, n); + case 6: + ab5 = VFNMSAC_VF(PREC, LMUL)(ab5, a11[5 * rsa11], ab4, n); + } + a11 += csa11; + + ab5 = VFMUL_VF(PREC, LMUL)(ab5, a11[5 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 5 * rsc, ab5, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 5 * rsc11, ab5, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 5 * rsc11, FLT_SIZE * csc11, ab5, n); + if (m == 6) return; + ab6 = VFNMSAC_VF(PREC, LMUL)(ab6, a11[6 * rsa11], ab5, n); + a11 += csa11; + + ab6 = VFMUL_VF(PREC, LMUL)(ab6, a11[6 * rsa11], n); + VSE_V_F(PREC, LMUL)(c + 6 * rsc, ab6, n); + if (csc11 == 1) + VSE_V_F(PREC, LMUL)(c11 + 6 * rsc11, ab6, n); + else + VSSE_V_F(PREC, LMUL)(c11 + 6 * rsc11, FLT_SIZE * csc11, ab6, n); + return; +} + +GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a10 = a10_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b01 = b01_; + DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + m, n, k, + alpha, + a10, 1, PACKMR, + b01, PACKNR, + b11, PACKNR, + a11, 1, PACKMR, + c11, rsc, csc + ); + + return; +} + +GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a12 = a12_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b21 = b21_; + DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + GEMMTRSM_IMPL_NAME(PRECISION_CHAR) + ( + m, n, k, + alpha, + a12 + (m - 1), -1, PACKMR, + b21, PACKNR, + b11 + (m - 1) * PACKNR, -PACKNR, + a11 + (m - 1) + (m - 1) * PACKMR, -1, -PACKMR, + c11 + (m - 1) * rsc, -rsc, csc + ); + + return; +} + +#undef GEMMTRSM_IMPL_NAME_ +#undef GEMMTRSM_IMPL_NAME + +#endif // GEMMTRSM diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h index 0ee01041ea..ff7b445c47 100644 --- a/kernels/sifive_x280/bli_kernels_sifive_x280.h +++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h @@ -38,10 +38,10 @@ ADDV_KER_PROT(double, d, addv_sifive_x280_intr) ADDV_KER_PROT(scomplex, c, addv_sifive_x280_intr) ADDV_KER_PROT(dcomplex, z, addv_sifive_x280_intr) -AMAXV_KER_PROT(float, s, amaxv_sifive_x280_asm) -AMAXV_KER_PROT(double, d, amaxv_sifive_x280_asm) -AMAXV_KER_PROT(scomplex, c, amaxv_sifive_x280_asm) -AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_x280_asm) +AMAXV_KER_PROT(float, s, amaxv_sifive_x280_intr) +AMAXV_KER_PROT(double, d, amaxv_sifive_x280_intr) +AMAXV_KER_PROT(scomplex, c, amaxv_sifive_x280_intr) +AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_x280_intr) AXPBYV_KER_PROT(float, s, axpbyv_sifive_x280_intr) AXPBYV_KER_PROT(double, d, axpbyv_sifive_x280_intr) @@ -53,10 +53,10 @@ AXPYV_KER_PROT(double, d, axpyv_sifive_x280_intr) AXPYV_KER_PROT(scomplex, c, axpyv_sifive_x280_intr) AXPYV_KER_PROT(dcomplex, z, 
axpyv_sifive_x280_intr) -COPYV_KER_PROT(float, s, copyv_sifive_x280_asm) -COPYV_KER_PROT(double, d, copyv_sifive_x280_asm) -COPYV_KER_PROT(scomplex, c, copyv_sifive_x280_asm) -COPYV_KER_PROT(dcomplex, z, copyv_sifive_x280_asm) +COPYV_KER_PROT(float, s, copyv_sifive_x280_intr) +COPYV_KER_PROT(double, d, copyv_sifive_x280_intr) +COPYV_KER_PROT(scomplex, c, copyv_sifive_x280_intr) +COPYV_KER_PROT(dcomplex, z, copyv_sifive_x280_intr) DOTV_KER_PROT(float, s, dotv_sifive_x280_intr) DOTV_KER_PROT(double, d, dotv_sifive_x280_intr) @@ -68,15 +68,15 @@ DOTXV_KER_PROT(double, d, dotxv_sifive_x280_intr) DOTXV_KER_PROT(scomplex, c, dotxv_sifive_x280_intr) DOTXV_KER_PROT(dcomplex, z, dotxv_sifive_x280_intr) -INVERTV_KER_PROT(float, s, invertv_sifive_x280_asm) -INVERTV_KER_PROT(double, d, invertv_sifive_x280_asm) -INVERTV_KER_PROT(scomplex, c, invertv_sifive_x280_asm) -INVERTV_KER_PROT(dcomplex, z, invertv_sifive_x280_asm) +INVERTV_KER_PROT(float, s, invertv_sifive_x280_intr) +INVERTV_KER_PROT(double, d, invertv_sifive_x280_intr) +INVERTV_KER_PROT(scomplex, c, invertv_sifive_x280_intr) +INVERTV_KER_PROT(dcomplex, z, invertv_sifive_x280_intr) -INVSCALV_KER_PROT(float, s, invscalv_sifive_x280_asm) -INVSCALV_KER_PROT(double, d, invscalv_sifive_x280_asm) -INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_asm) -INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_asm) +INVSCALV_KER_PROT(float, s, invscalv_sifive_x280_intr) +INVSCALV_KER_PROT(double, d, invscalv_sifive_x280_intr) +INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_intr) +INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_intr) SCAL2V_KER_PROT(float, s, scal2v_sifive_x280_intr) SCAL2V_KER_PROT(double, d, scal2v_sifive_x280_intr) @@ -88,20 +88,20 @@ SCALV_KER_PROT(double, d, scalv_sifive_x280_intr) SCALV_KER_PROT(scomplex, c, scalv_sifive_x280_intr) SCALV_KER_PROT(dcomplex, z, scalv_sifive_x280_intr) -SETV_KER_PROT(float, s, setv_sifive_x280_asm) -SETV_KER_PROT(double, d, setv_sifive_x280_asm) -SETV_KER_PROT(scomplex, c, setv_sifive_x280_asm) -SETV_KER_PROT(dcomplex, z, setv_sifive_x280_asm) +SETV_KER_PROT(float, s, setv_sifive_x280_intr) +SETV_KER_PROT(double, d, setv_sifive_x280_intr) +SETV_KER_PROT(scomplex, c, setv_sifive_x280_intr) +SETV_KER_PROT(dcomplex, z, setv_sifive_x280_intr) SUBV_KER_PROT(float, s, subv_sifive_x280_intr) SUBV_KER_PROT(double, d, subv_sifive_x280_intr) SUBV_KER_PROT(scomplex, c, subv_sifive_x280_intr) SUBV_KER_PROT(dcomplex, z, subv_sifive_x280_intr) -SWAPV_KER_PROT(float, s, swapv_sifive_x280_asm) -SWAPV_KER_PROT(double, d, swapv_sifive_x280_asm) -SWAPV_KER_PROT(scomplex, c, swapv_sifive_x280_asm) -SWAPV_KER_PROT(dcomplex, z, swapv_sifive_x280_asm) +SWAPV_KER_PROT(float, s, swapv_sifive_x280_intr) +SWAPV_KER_PROT(double, d, swapv_sifive_x280_intr) +SWAPV_KER_PROT(scomplex, c, swapv_sifive_x280_intr) +SWAPV_KER_PROT(dcomplex, z, swapv_sifive_x280_intr) XPBYV_KER_PROT(float, s, xpbyv_sifive_x280_intr) XPBYV_KER_PROT(double, d, xpbyv_sifive_x280_intr) @@ -114,31 +114,31 @@ AXPY2V_KER_PROT(double, d, axpy2v_sifive_x280_intr) AXPY2V_KER_PROT(scomplex, c, axpy2v_sifive_x280_intr) AXPY2V_KER_PROT(dcomplex, z, axpy2v_sifive_x280_intr) -AXPYF_KER_PROT(float, s, axpyf_sifive_x280_asm) -AXPYF_KER_PROT(double, d, axpyf_sifive_x280_asm) -AXPYF_KER_PROT(scomplex, c, axpyf_sifive_x280_asm) -AXPYF_KER_PROT(dcomplex, z, axpyf_sifive_x280_asm) +AXPYF_KER_PROT(float, s, axpyf_sifive_x280_intr) +AXPYF_KER_PROT(double, d, axpyf_sifive_x280_intr) +AXPYF_KER_PROT(scomplex, c, axpyf_sifive_x280_intr) +AXPYF_KER_PROT(dcomplex, z, 
axpyf_sifive_x280_intr) -DOTXF_KER_PROT(float, s, dotxf_sifive_x280_asm) -DOTXF_KER_PROT(double, d, dotxf_sifive_x280_asm) -DOTXF_KER_PROT(scomplex, c, dotxf_sifive_x280_asm) -DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_x280_asm) +DOTXF_KER_PROT(float, s, dotxf_sifive_x280_intr) +DOTXF_KER_PROT(double, d, dotxf_sifive_x280_intr) +DOTXF_KER_PROT(scomplex, c, dotxf_sifive_x280_intr) +DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_x280_intr) DOTAXPYV_KER_PROT(float, s, dotaxpyv_sifive_x280_intr) DOTAXPYV_KER_PROT(double, d, dotaxpyv_sifive_x280_intr) DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr) DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr) -DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_x280_asm) -DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_x280_asm) -DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_asm) -DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_asm) +DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_x280_intr) +DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_x280_intr) +DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_intr) +DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_intr) // Level 1m -PACKM_KER_PROT(float, s, packm_sifive_x280_asm_7m4) -PACKM_KER_PROT(double, d, packm_sifive_x280_asm_7m4) -PACKM_KER_PROT(scomplex, c, packm_sifive_x280_asm_6m2) -PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_asm_6m2) +PACKM_KER_PROT(float, s, packm_sifive_x280_intr) +PACKM_KER_PROT(double, d, packm_sifive_x280_intr) +PACKM_KER_PROT(scomplex, c, packm_sifive_x280_intr) +PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_intr) // Reference 1m PACKM_KER_PROT(float, ss, packm_sifive_x280_ref) @@ -147,16 +147,16 @@ PACKM_KER_PROT(scomplex, cc, packm_sifive_x280_ref) PACKM_KER_PROT(dcomplex, zz, packm_sifive_x280_ref) // Level 3 -GEMM_UKR_PROT(float, s, gemm_sifive_x280_asm_7m4) -GEMM_UKR_PROT(double, d, gemm_sifive_x280_asm_7m4) -GEMM_UKR_PROT(scomplex, c, gemm_sifive_x280_asm_6m2) -GEMM_UKR_PROT(dcomplex, z, gemm_sifive_x280_asm_6m2) - -GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_x280_asm) -GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_x280_asm) -GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_asm) -GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_asm) -GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_x280_asm) -GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_x280_asm) -GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_asm) -GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_asm) +GEMM_UKR_PROT(float, s, gemm_sifive_x280_intr) +GEMM_UKR_PROT(double, d, gemm_sifive_x280_intr) +GEMM_UKR_PROT(scomplex, c, gemm_sifive_x280_intr) +GEMM_UKR_PROT(dcomplex, z, gemm_sifive_x280_intr) + +GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_x280_intr) +GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_x280_intr) +GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_intr) +GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_intr) +GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_x280_intr) +GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_x280_intr) +GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_intr) +GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_intr) diff --git a/kernels/sifive_x280/riscv_cmul_macros_intr.h b/kernels/sifive_x280/riscv_cmul_macros_intr.h new file mode 100644 index 0000000000..70a0a16124 --- /dev/null +++ b/kernels/sifive_x280/riscv_cmul_macros_intr.h @@ -0,0 +1,147 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, SiFive, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "riscv_overloaded_intrinsics.h" + +// macros to emit complex multiplication +// caveat: the destination registers cannot overlap the source registers! + +// vd = vs2 * f[rs1] +#define VCMUL_VF(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \ + do { \ + VD_R = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_R, VL); \ + VD_I = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_I, VL); \ + VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd = conj(vs2) * f[rs1] +#define VCMUL_VF_CONJ(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \ + do { \ + VD_R = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_R, VL); \ + VD_I = VFMUL_VF(PREC, LMUL)(VS2_R, RS1_I, VL); \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd = vs2 * f[rs1] +#define VCMUL_VF_TU(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \ + do { \ + VD_R = VFMUL_VF_TU(PREC, LMUL)(VS2_R, VS2_R, RS1_R, VL); \ + VD_I = VFMUL_VF_TU(PREC, LMUL)(VS2_I, VS2_I, RS1_R, VL); \ + VD_R = VFNMSAC_VF_TU(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMACC_VF_TU(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + } while(0) + +// vd = conj(vs2) * f[rs1] +#define VCMUL_VF_CONJ_TU(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, RS1_R, RS1_I, VL) \ + do { \ + VD_R = VFMUL_VF_TU(PREC, LMUL)(VS2_R, VS2_R, RS1_R, VL); \ + VD_I = VFMUL_VF_TU(PREC, LMUL)(VS2_I, VS2_I, RS1_R, VL); \ + VD_R = VFMACC_VF_TU(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMSAC_VF_TU(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + } while(0) + +// vd = vs2 * vs1 +#define VCMUL_VV(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, VS1_R, VS1_I, VL) \ + do { \ + VD_R = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_R, VL); \ + VD_I = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_I, VL); \ + VD_R = VFNMSAC_VV(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL); \ + VD_I = VFMACC_VV(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL); \ + } while(0) + +// vd = conj(vs2) * vs1 +#define VCMUL_VV_CONJ(PREC, LMUL, VD_R, VD_I, VS2_R, VS2_I, VS1_R, VS1_I, VL) \ + do { \ + VD_R = 
VFMUL_VV(PREC, LMUL)(VS2_R, VS1_R, VL); \ + VD_I = VFMUL_VV(PREC, LMUL)(VS2_R, VS1_I, VL); \ + VD_R = VFMACC_VV(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VV(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL); \ + } while(0) + +// vd += vs2 * f[rs1] +#define VCMACC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd += conj(vs2) * f[rs1] +#define VCMACC_VF_CONJ(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd = vs2 * f[rs1] - vd +#define VCMSAC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMSAC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL); \ + VD_I = VFMSAC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFMACC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd -= vs2 * f[rs1] +#define VCNMSAC_VF(PREC, LMUL, VD_R, VD_I, RS1_R, RS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFNMSAC_VF(PREC, LMUL)(VD_R, RS1_R, VS2_R, VL); \ + VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_I, VS2_R, VL); \ + VD_R = VFMACC_VF(PREC, LMUL)(VD_R, RS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VF(PREC, LMUL)(VD_I, RS1_R, VS2_I, VL); \ + } while(0) + +// vd += vs2 * vs1 +#define VCMACC_VV_TU(PREC, LMUL, VD_R, VD_I, VS1_R, VS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_R, VS2_R, VL); \ + VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_I, VS2_R, VL); \ + VD_R = VFNMSAC_VV_TU(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL); \ + VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL); \ + } while(0) + +// vd += conj(vs2) * vs1 +#define VCMACC_VV_CONJ_TU(PREC, LMUL, VD_R, VD_I, VS1_R, VS1_I, VS2_R, VS2_I, VL) \ + do { \ + VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_R, VS2_R, VL); \ + VD_I = VFMACC_VV_TU(PREC, LMUL)(VD_I, VS1_I, VS2_R, VL); \ + VD_R = VFMACC_VV_TU(PREC, LMUL)(VD_R, VS1_I, VS2_I, VL); \ + VD_I = VFNMSAC_VV_TU(PREC, LMUL)(VD_I, VS1_R, VS2_I, VL); \ + } while(0) + diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h index 6a1d11b131..44f70f2727 100644 --- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h +++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h @@ -33,6 +33,10 @@ */ // 6. 
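The complex-arithmetic macros defined in riscv_cmul_macros_intr.h above keep real and imaginary parts in separate vector registers (the kernels deinterleave them with the vlseg2/vsseg2 intrinsics) and expand each complex multiply or multiply-accumulate into two vector multiplies plus two fused multiply-add/subtract intrinsics. The scalar model below is illustrative only (cmul_vf is not a name from the patch); it mirrors the four-step order of VCMUL_VF and shows why the header warns that destinations must not overlap sources.

#include <stdio.h>

/* Scalar analogue of VCMUL_VF:
   (vd_r + i*vd_i) = (vs2_r + i*vs2_i) * (rs1_r + i*rs1_i),
   written in the same four-step order as the macro. If vd_r aliased the
   storage behind vs2_r, the second step would read a clobbered value,
   which is the overlap caveat stated in the header. */
static void cmul_vf(double *vd_r, double *vd_i,
                    const double *vs2_r, const double *vs2_i,
                    double rs1_r, double rs1_i)
{
    *vd_r = *vs2_r * rs1_r;            /* VFMUL_VF   */
    *vd_i = *vs2_r * rs1_i;            /* VFMUL_VF   */
    *vd_r = *vd_r - rs1_i * *vs2_i;    /* VFNMSAC_VF */
    *vd_i = *vd_i + rs1_r * *vs2_i;    /* VFMACC_VF  */
}

int main(void)
{
    /* (1 + 2i) * (3 + 4i) = -5 + 10i */
    double a_r = 1.0, a_i = 2.0, d_r, d_i;
    cmul_vf(&d_r, &d_i, &a_r, &a_i, 3.0, 4.0);
    printf("%g %+gi\n", d_r, d_i);
    return 0;
}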
diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
index 6a1d11b131..44f70f2727 100644
--- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h
+++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
@@ -33,6 +33,10 @@
 */
 // 6. Configuration-Setting and Utility Functions
+#define RVV_TYPE_B_(RATIO) vbool##RATIO##_t
+#define RVV_TYPE_B(RATIO) RVV_TYPE_B_(RATIO)
+#define RVV_TYPE_U_(PRECISION, LMUL) vuint##PRECISION##LMUL##_t
+#define RVV_TYPE_U(PRECISION, LMUL) RVV_TYPE_U_(PRECISION, LMUL)
 #define RVV_TYPE_F_(PRECISION, LMUL) vfloat##PRECISION##LMUL##_t
 #define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL)
 #define RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) vfloat##PRECISION##LMUL##x##NFIELDS##_t
@@ -50,6 +54,14 @@
 #define VLSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSEG2_V_F_(PRECISION, LMUL, NFIELDS)
 #define VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
 #define VLSSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VLE_V_F_TU_(PRECISION, LMUL) __riscv_vle##PRECISION##_v_f##PRECISION##LMUL##_tu
+#define VLE_V_F_TU(PRECISION, LMUL) VLE_V_F_TU_(PRECISION, LMUL)
+#define VLSE_V_F_TU_(PRECISION, LMUL) __riscv_vlse##PRECISION##_v_f##PRECISION##LMUL##_tu
+#define VLSE_V_F_TU(PRECISION, LMUL) VLSE_V_F_TU_(PRECISION, LMUL)
+#define VLSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS) __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS##_tu
+#define VLSEG2_V_F_TU(PRECISION, LMUL, NFIELDS) VLSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS)
+#define VLSSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS) __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS##_tu
+#define VLSSEG2_V_F_TU(PRECISION, LMUL, NFIELDS) VLSSEG2_V_F_TU_(PRECISION, LMUL, NFIELDS)
 // Stores
 #define VSE_V_F_(PRECISION, LMUL) __riscv_vse##PRECISION##_v_f##PRECISION##LMUL
 #define VSE_V_F(PRECISION, LMUL) VSE_V_F_(PRECISION, LMUL)
@@ -59,58 +71,131 @@
 #define VSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
 #define VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
 #define VSSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG3_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg3e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG3_V_F(PRECISION, LMUL, NFIELDS) VSSSEG3_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG4_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg4e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG4_V_F(PRECISION, LMUL, NFIELDS) VSSSEG4_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG5_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg5e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG5_V_F(PRECISION, LMUL, NFIELDS) VSSSEG5_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG6_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg6e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG6_V_F(PRECISION, LMUL, NFIELDS) VSSSEG6_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG7_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg7e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG7_V_F(PRECISION, LMUL, NFIELDS) VSSSEG7_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG8_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg8e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG8_V_F(PRECISION, LMUL, NFIELDS) VSSSEG8_V_F_(PRECISION, LMUL, NFIELDS)
+
+// 11. Vector Integer Arithmetic Operations
+#define VADD_VX_U_(PRECISION, LMUL) __riscv_vadd_vx_u##PRECISION##LMUL
+#define VADD_VX_U(PRECISION, LMUL) VADD_VX_U_(PRECISION, LMUL)
+#define VMERGE_VVM_TU_U_(PRECISION, LMUL) __riscv_vmerge_vvm_u##PRECISION##LMUL##_tu
+#define VMERGE_VVM_TU_U(PRECISION, LMUL) VMERGE_VVM_TU_U_(PRECISION, LMUL)
 // 13. Vector Floating-Point Operations
 #define VFADD_VV_(PRECISION, LMUL) __riscv_vfadd_vv_f##PRECISION##LMUL
 #define VFADD_VV(PRECISION, LMUL) VFADD_VV_(PRECISION, LMUL)
 #define VFSUB_VV_(PRECISION, LMUL) __riscv_vfsub_vv_f##PRECISION##LMUL
 #define VFSUB_VV(PRECISION, LMUL) VFSUB_VV_(PRECISION, LMUL)
-#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
-#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
 #define VFMUL_VV_(PRECISION, LMUL) __riscv_vfmul_vv_f##PRECISION##LMUL
 #define VFMUL_VV(PRECISION, LMUL) VFMUL_VV_(PRECISION, LMUL)
 #define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
 #define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
-#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL
-#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL)
+#define VFDIV_VV_(PRECISION, LMUL) __riscv_vfdiv_vv_f##PRECISION##LMUL
+#define VFDIV_VV(PRECISION, LMUL) VFDIV_VV_(PRECISION, LMUL)
+#define VFRDIV_VF_(PRECISION, LMUL) __riscv_vfrdiv_vf_f##PRECISION##LMUL
+#define VFRDIV_VF(PRECISION, LMUL) VFRDIV_VF_(PRECISION, LMUL)
 #define VFMACC_VV_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL
 #define VFMACC_VV(PRECISION, LMUL) VFMACC_VV_(PRECISION, LMUL)
-#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu
-#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL)
+#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL
+#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL)
 #define VFMSAC_VF_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL
 #define VFMSAC_VF(PRECISION, LMUL) VFMSAC_VF_(PRECISION, LMUL)
+#define VFNMSAC_VV_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL
+#define VFNMSAC_VV(PRECISION, LMUL) VFNMSAC_VV_(PRECISION, LMUL)
 #define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL
 #define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL)
-#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu
-#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL)
 #define VFMADD_VF_(PRECISION, LMUL) __riscv_vfmadd_vf_f##PRECISION##LMUL
 #define VFMADD_VF(PRECISION, LMUL) VFMADD_VF_(PRECISION, LMUL)
 #define VFMSUB_VF_(PRECISION, LMUL) __riscv_vfmsub_vf_f##PRECISION##LMUL
 #define VFMSUB_VF(PRECISION, LMUL) VFMSUB_VF_(PRECISION, LMUL)
+#define VFMAX_VV_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL
+#define VFMAX_VV(PRECISION, LMUL) VFMAX_VV_(PRECISION, LMUL)
 #define VFNEG_VF_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL
 #define VFNEG_VF(PRECISION, LMUL) VFNEG_VF_(PRECISION, LMUL)
+#define VFABS_V_(PRECISION, LMUL) __riscv_vfabs_v_f##PRECISION##LMUL
+#define VFABS_V(PRECISION, LMUL) VFABS_V_(PRECISION, LMUL)
+#define VMFEQ_VV_(PRECISION, LMUL, RATIO) __riscv_vmfeq_vv_f##PRECISION##LMUL##_b##RATIO
+#define VMFEQ_VV(PRECISION, LMUL, RATIO) VMFEQ_VV_(PRECISION, LMUL, RATIO)
+#define VMFNE_VV_(PRECISION, LMUL, RATIO) __riscv_vmfne_vv_f##PRECISION##LMUL##_b##RATIO
+#define VMFNE_VV(PRECISION, LMUL, RATIO) VMFNE_VV_(PRECISION, LMUL, RATIO)
+#define VMFGT_VV_(PRECISION, LMUL, RATIO) __riscv_vmfgt_vv_f##PRECISION##LMUL##_b##RATIO
+#define VMFGT_VV(PRECISION, LMUL, RATIO) VMFGT_VV_(PRECISION, LMUL, RATIO)
+#define VMFGE_VV_(PRECISION, LMUL, RATIO) __riscv_vmfge_vv_f##PRECISION##LMUL##_b##RATIO
+#define VMFGE_VV(PRECISION, LMUL, RATIO) VMFGE_VV_(PRECISION, LMUL, RATIO)
+#define VMERGE_VVM_F_(PRECISION, LMUL) __riscv_vmerge_vvm_f##PRECISION##LMUL
+#define VMERGE_VVM_F(PRECISION, LMUL) VMERGE_VVM_F_(PRECISION, LMUL)
 #define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)( __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG
 #define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL)
+#define VFMV_V_F_(PRECISION, LMUL) __riscv_vfmv_v_f_f##PRECISION##LMUL
+#define VFMV_V_F(PRECISION, LMUL) VFMV_V_F_(PRECISION, LMUL)
+
+#define VFMUL_VF_TU_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL##_tu
+#define VFMUL_VF_TU(PRECISION, LMUL) VFMUL_VF_TU_(PRECISION, LMUL)
+#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu
+#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL)
+#define VFMACC_VF_TU_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL##_tu
+#define VFMACC_VF_TU(PRECISION, LMUL) VFMACC_VF_TU_(PRECISION, LMUL)
+#define VFMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL##_tu
+#define VFMSAC_VF_TU(PRECISION, LMUL) VFMSAC_VF_TU_(PRECISION, LMUL)
+#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu
+#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL)
+#define VFNMSAC_VF_TU_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL##_tu
+#define VFNMSAC_VF_TU(PRECISION, LMUL) VFNMSAC_VF_TU_(PRECISION, LMUL)
+#define VFMAX_VV_TU_(PRECISION, LMUL) __riscv_vfmax_vv_f##PRECISION##LMUL##_tu
+#define VFMAX_VV_TU(PRECISION, LMUL) VFMAX_VV_TU_(PRECISION, LMUL)
+#define VFNEG_VF_TU_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL##_tu
+#define VFNEG_VF_TU(PRECISION, LMUL) VFNEG_VF_TU_(PRECISION, LMUL)
 // 14. Vector Reduction Operations
+#define VREDMINU_VS_M_(PRECISION, LMUL) __riscv_vredminu_vs_u##PRECISION##LMUL##_u##PRECISION##m1_m
+#define VREDMINU_VS_M(PRECISION, LMUL) VREDMINU_VS_M_(PRECISION, LMUL)
 #define VF_REDUSUM_VS_(PRECISION, LMUL) __riscv_vfredusum_vs_f##PRECISION##LMUL##_f##PRECISION##m1
 #define VF_REDUSUM_VS(PRECISION, LMUL) VF_REDUSUM_VS_(PRECISION, LMUL)
+#define VFREDMAX_VS_(PRECISION, LMUL) __riscv_vfredmax_vs_f##PRECISION##LMUL##_f##PRECISION##m1
+#define VFREDMAX_VS(PRECISION, LMUL) VFREDMAX_VS_(PRECISION, LMUL)
+
+// 15. Vector Mask Operations
+#define VFIRST_M_(RATIO) __riscv_vfirst_m_b##RATIO
+#define VFIRST_M(RATIO) VFIRST_M_(RATIO)
+#define VID_V_(PRECISION, LMUL) __riscv_vid_v_u##PRECISION##LMUL
+#define VID_V(PRECISION, LMUL) VID_V_(PRECISION, LMUL)
 // 16. Vector Permutation Operations
-#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL
-#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL)
+#define VMV_X_S_U_(PRECISION) __riscv_vmv_x_s_u##PRECISION##m1_u##PRECISION
+#define VMV_X_S_U(PRECISION) VMV_X_S_U_(PRECISION)
+#define VMV_S_X_U_(PRECISION, LMUL) __riscv_vmv_s_x_u##PRECISION##LMUL
+#define VMV_S_X_U(PRECISION, LMUL) VMV_S_X_U_(PRECISION, LMUL)
 #define VFMV_F_S_(PRECISION) __riscv_vfmv_f_s_f##PRECISION##m1_f##PRECISION
 #define VFMV_F_S(PRECISION) VFMV_F_S_(PRECISION)
+#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL
+#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL)
+#define VRGATHER_VX_F_(PRECISION, LMUL) __riscv_vrgather_vx_f##PRECISION##LMUL
+#define VRGATHER_VX_F(PRECISION, LMUL) VRGATHER_VX_F_(PRECISION, LMUL)
 // Miscellaneous Vector Function
 #define VREINTERPRET_V_I_F_(PRECISION, LMUL) __riscv_vreinterpret_v_i##PRECISION##LMUL##_f##PRECISION##LMUL
 #define VREINTERPRET_V_I_F(PRECISION, LMUL) VREINTERPRET_V_I_F_(PRECISION, LMUL)
 #define VREINTERPRET_V_F_I_(PRECISION, LMUL) __riscv_vreinterpret_v_f##PRECISION##LMUL##_i##PRECISION##LMUL
 #define VREINTERPRET_V_F_I(PRECISION, LMUL) VREINTERPRET_V_F_I_(PRECISION, LMUL)
-#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL
-#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VLMUL_EXT_V_F_M1_(PRECISION, LMUL) __riscv_vlmul_ext_v_f##PRECISION##m1##_f##PRECISION##LMUL
+#define VLMUL_EXT_V_F_M1(PRECISION, LMUL) VLMUL_EXT_V_F_M1_(PRECISION, LMUL)
+#define VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS) __riscv_vundefined_f##PRECISION##LMUL##x##NFIELDS
+#define VUNDEFINED_FX(PRECISION, LMUL, NFIELDS) VUNDEFINED_FX_(PRECISION, LMUL, NFIELDS)
 #define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS
 #define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL
+#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VCREATE_V_FX_(PRECISION, LMUL, NFIELDS) __riscv_vcreate_v_f##PRECISION##LMUL##x##NFIELDS
+#define VCREATE_V_FX(PRECISION, LMUL, NFIELDS) VCREATE_V_FX_(PRECISION, LMUL, NFIELDS)
 // Non-vector functions
 #define CURRY_1ARG(arg1, ...) (arg1), __VA_ARGS__))
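These wrappers select a concrete RVV intrinsic by token-pasting the element precision and the LMUL suffix, so a single kernel body can be instantiated for every supported type. A minimal sketch of the usage pattern follows; it assumes the header is on the include path, that the pre-existing VLE_V_F load wrapper is available alongside VSE_V_F, and that the code is built with an RVV-enabled toolchain (e.g. -march=rv64gcv). The function and macro names chosen here are illustrative, not taken from the kernel sources.

#include <stddef.h>
#include <riscv_vector.h>
#include "riscv_overloaded_intrinsics.h"

#define PREC 32   /* element precision in bits */
#define LMUL m2   /* register-group multiplier suffix */

/* y := a*x + y.  VFMACC_VF(PREC, LMUL) expands to __riscv_vfmacc_vf_f32m2,
 * RVV_TYPE_F(PREC, LMUL) to vfloat32m2_t, and so on. */
void axpy_sketch(size_t n, float a, const float* restrict x, float* restrict y)
{
    while (n > 0) {
        size_t vl = __riscv_vsetvl_e32m2(n);
        RVV_TYPE_F(PREC, LMUL) xvec = VLE_V_F(PREC, LMUL)(x, vl);
        RVV_TYPE_F(PREC, LMUL) yvec = VLE_V_F(PREC, LMUL)(y, vl);
        yvec = VFMACC_VF(PREC, LMUL)(yvec, a, xvec, vl);
        VSE_V_F(PREC, LMUL)(y, yvec, vl);
        x += vl;
        y += vl;
        n -= vl;
    }
}

Re-instantiating the same body with PREC 64 and a different LMUL resolves to the corresponding f64 intrinsics, which is the pattern the *_intr_real.c and *_intr_complex.c kernel files rely on to avoid per-datatype duplication.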
diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh
index 56c2b85c26..82b6afee62 100755
--- a/travis/do_riscv.sh
+++ b/travis/do_riscv.sh
@@ -3,7 +3,7 @@
 set -e
 set -x
 
-TAG=2023.10.18
+TAG=2024.08.03
 
 # The prebuilt toolchains only support hardfloat, so we only
 # test these for now.
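A usage note on the new _tu (tail-undisturbed) wrappers such as VFMACC_VV_TU and VLE_V_F_TU: in a strip-mined loop that keeps a full-width vector accumulator, the final, shorter iteration must leave the accumulator's tail elements unchanged so that a closing reduction still sees every partial result. The sketch below illustrates the idea with raw intrinsics for a single-precision dot product; it is illustrative only and not taken from the kernel sources.

#include <stddef.h>
#include <riscv_vector.h>

float dot_sketch(size_t n, const float* restrict x, const float* restrict y)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    /* Full-width accumulator, zero-initialized across all vlmax lanes. */
    vfloat32m4_t acc = __riscv_vfmv_v_f_f32m4(0.0f, vlmax);
    while (n > 0) {
        size_t vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4_t xv = __riscv_vle32_v_f32m4(x, vl);
        vfloat32m4_t yv = __riscv_vle32_v_f32m4(y, vl);
        /* Tail-undisturbed FMA: lanes past vl keep their previous partial
         * sums.  This is what VFMACC_VV_TU(PREC, LMUL) wraps. */
        acc = __riscv_vfmacc_vv_f32m4_tu(acc, xv, yv, vl);
        x += vl;
        y += vl;
        n -= vl;
    }
    /* Reduce all vlmax lanes; VF_REDUSUM_VS is the overloaded wrapper. */
    vfloat32m1_t zero = __riscv_vfmv_s_f_f32m1(0.0f, 1);
    vfloat32m1_t sum  = __riscv_vfredusum_vs_f32m4_f32m1(acc, zero, vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(sum);
}

With the default tail-agnostic intrinsics, the lanes beyond vl in the last iteration may be overwritten, which is why the tail-undisturbed forms were added to the wrapper header.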