From 3eccc392080951358a37ee7c121c6be6a16ac083 Mon Sep 17 00:00:00 2001
From: Yang Hau
Date: Sun, 28 Jul 2024 00:54:58 +0800
Subject: [PATCH] feat: Add vmulx[q]_lane[q]_[f32|f64]

---
 neon2rvv.h     |  48 +++++++++---
 tests/impl.cpp | 191 +++++++++++++++++++++++++++++++++++++++++++++---
 tests/impl.h   |  16 ++--
 3 files changed, 225 insertions(+), 30 deletions(-)

diff --git a/neon2rvv.h b/neon2rvv.h
index 38b943ab..2b16ceda 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -845,29 +845,53 @@ FORCE_INLINE float64_t vmulxd_f64(float64_t a, float64_t b) {
   return mul;
 }
 
-// FORCE_INLINE float32x2_t vmulx_lane_f32(float32x2_t a, float32x2_t v, const int lane);
+FORCE_INLINE float32x2_t vmulx_lane_f32(float32x2_t a, float32x2_t b, const int lane) {
+  vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 2);
+  return vmulx_f32(a, b_dup_lane);
+}
 
-// FORCE_INLINE float32x4_t vmulxq_lane_f32(float32x4_t a, float32x2_t v, const int lane);
+FORCE_INLINE float32x4_t vmulxq_lane_f32(float32x4_t a, float32x2_t b, const int lane) {
+  vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
+  return vmulxq_f32(a, b_dup_lane);
+}
 
-// FORCE_INLINE float64x1_t vmulx_lane_f64(float64x1_t a, float64x1_t v, const int lane);
+FORCE_INLINE float64x1_t vmulx_lane_f64(float64x1_t a, float64x1_t b, const int lane) {
+  vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 1);
+  return vmulx_f64(a, b_dup_lane);
+}
 
-// FORCE_INLINE float64x2_t vmulxq_lane_f64(float64x2_t a, float64x1_t v, const int lane);
+FORCE_INLINE float64x2_t vmulxq_lane_f64(float64x2_t a, float64x1_t b, const int lane) {
+  vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
+  return vmulxq_f64(a, b_dup_lane);
+}
 
-// FORCE_INLINE float32_t vmulxs_lane_f32(float32_t a, float32x2_t v, const int lane);
+// FORCE_INLINE float32_t vmulxs_lane_f32(float32_t a, float32x2_t b, const int lane);
 
-// FORCE_INLINE float64_t vmulxd_lane_f64(float64_t a, float64x1_t v, const int lane);
+// FORCE_INLINE float64_t vmulxd_lane_f64(float64_t a, float64x1_t b, const int lane);
 
-// FORCE_INLINE float32x2_t vmulx_laneq_f32(float32x2_t a, float32x4_t v, const int lane);
+FORCE_INLINE float32x2_t vmulx_laneq_f32(float32x2_t a, float32x4_t b, const int lane) {
+  vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
+  return vmulx_f32(a, b_dup_lane);
+}
 
-// FORCE_INLINE float32x4_t vmulxq_laneq_f32(float32x4_t a, float32x4_t v, const int lane);
+FORCE_INLINE float32x4_t vmulxq_laneq_f32(float32x4_t a, float32x4_t b, const int lane) {
+  vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
+  return vmulxq_f32(a, b_dup_lane);
+}
 
-// FORCE_INLINE float64x1_t vmulx_laneq_f64(float64x1_t a, float64x2_t v, const int lane);
+FORCE_INLINE float64x1_t vmulx_laneq_f64(float64x1_t a, float64x2_t b, const int lane) {
+  vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
+  return vmulx_f64(a, b_dup_lane);
+}
 
-// FORCE_INLINE float64x2_t vmulxq_laneq_f64(float64x2_t a, float64x2_t v, const int lane);
+FORCE_INLINE float64x2_t vmulxq_laneq_f64(float64x2_t a, float64x2_t b, const int lane) {
+  vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
+  return vmulxq_f64(a, b_dup_lane);
+}
 
-// FORCE_INLINE float32_t vmulxs_laneq_f32(float32_t a, float32x4_t v, const int lane);
+// FORCE_INLINE float32_t vmulxs_laneq_f32(float32_t a, float32x4_t b, const int lane);
 
-// FORCE_INLINE float64_t vmulxd_laneq_f64(float64_t a, float64x2_t v, const int lane);
+// FORCE_INLINE float64_t vmulxd_laneq_f64(float64_t a, float64x2_t b, const int lane);
 
 FORCE_INLINE float32x2_t vdiv_f32(float32x2_t a, float32x2_t b) { return __riscv_vfdiv_vv_f32m1(a, b, 2); }
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 46ceed96..bcb530eb 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -2872,25 +2872,198 @@ result_t test_vmulxd_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #endif  // ENABLE_TEST_ALL
 }
 
-result_t test_vmulx_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmulx_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (float *)impl.test_cases_float_pointer1;
+  const float *_b = (float *)impl.test_cases_float_pointer2;
+  float _c[2];
+  float32x2_t a = vld1_f32(_a);
+  float32x2_t b = vld1_f32(_b);
+  float32x2_t c;
+
+#define TEST_IMPL(IDX)           \
+  for (int i = 0; i < 2; i++) {  \
+    _c[i] = _a[i] * _b[IDX];     \
+  }                              \
+  c = vmulx_lane_f32(a, b, IDX); \
+  CHECK_RESULT(validate_float(c, _c[0], _c[1]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
-result_t test_vmulxq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmulxq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (float *)impl.test_cases_float_pointer1;
+  const float *_b = (float *)impl.test_cases_float_pointer2;
+  float _c[4];
+  float32x4_t a = vld1q_f32(_a);
+  float32x2_t b = vld1_f32(_b);
+  float32x4_t c;
 
-result_t test_vmulx_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 4; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulxq_lane_f32(a, b, IDX); \
+  CHECK_RESULT(validate_float(c, _c[0], _c[1], _c[2], _c[3]))
 
-result_t test_vmulxq_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulx_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (double *)impl.test_cases_float_pointer1;
+  const double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[1];
+  float64x1_t a = vld1_f64(_a);
+  float64x1_t b = vld1_f64(_b);
+  float64x1_t c;
+
+  for (int i = 0; i < 1; i++) {
+    _c[i] = _a[i] * _b[0];
+  }
+  c = vmulx_lane_f64(a, b, 0);
+  return validate_double(c, _c[0]);
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulxq_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (double *)impl.test_cases_float_pointer1;
+  const double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[2];
+  float64x2_t a = vld1q_f64(_a);
+  float64x1_t b = vld1_f64(_b);
+  float64x2_t c;
+
+  for (int i = 0; i < 2; i++) {
+    _c[i] = _a[i] * _b[0];
+  }
+  c = vmulxq_lane_f64(a, b, 0);
+  return validate_double(c, _c[0], _c[1]);
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vmulxs_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
 result_t test_vmulxd_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
-result_t test_vmulx_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmulx_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (float *)impl.test_cases_float_pointer1;
+  const float *_b = (float *)impl.test_cases_float_pointer2;
+  float _c[2];
+  float32x2_t a = vld1_f32(_a);
+  float32x4_t b = vld1q_f32(_b);
+  float32x2_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 2; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulx_laneq_f32(a, b, IDX); \
+  CHECK_RESULT(validate_float(c, _c[0], _c[1]))
+
+  IMM_4_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulxq_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (float *)impl.test_cases_float_pointer1;
+  const float *_b = (float *)impl.test_cases_float_pointer2;
+  float _c[4];
+  float32x4_t a = vld1q_f32(_a);
+  float32x4_t b = vld1q_f32(_b);
+  float32x4_t c;
 
-result_t test_vmulxq_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+#define TEST_IMPL(IDX)             \
+  for (int i = 0; i < 4; i++) {    \
+    _c[i] = _a[i] * _b[IDX];       \
+  }                                \
+  c = vmulxq_laneq_f32(a, b, IDX); \
+  CHECK_RESULT(validate_float(c, _c[0], _c[1], _c[2], _c[3]))
 
-result_t test_vmulx_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  IMM_4_ITER
+#undef TEST_IMPL
 
-result_t test_vmulxq_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulx_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (double *)impl.test_cases_float_pointer1;
+  const double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[1];
+  float64x1_t a = vld1_f64(_a);
+  float64x2_t b = vld1q_f64(_b);
+  float64x1_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 1; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulx_laneq_f64(a, b, IDX); \
+  CHECK_RESULT(validate_double(c, _c[0]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulxq_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (double *)impl.test_cases_float_pointer1;
+  const double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[2];
+  float64x2_t a = vld1q_f64(_a);
+  float64x2_t b = vld1q_f64(_b);
+  float64x2_t c;
+
+#define TEST_IMPL(IDX)             \
+  for (int i = 0; i < 2; i++) {    \
+    _c[i] = _a[i] * _b[IDX];       \
+  }                                \
+  c = vmulxq_laneq_f64(a, b, IDX); \
+  CHECK_RESULT(validate_double(c, _c[0], _c[1]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vmulxs_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
@@ -31126,7 +31299,6 @@ result_t test_vmul_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #undef TEST_IMPL
 
   return TEST_SUCCESS;
-
 #else
   return TEST_UNIMPL;
 #endif  // ENABLE_TEST_ALL
@@ -31255,7 +31427,6 @@ result_t test_vmulq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #undef TEST_IMPL
 
   return TEST_SUCCESS;
-
 #else
   return TEST_UNIMPL;
 #endif  // ENABLE_TEST_ALL
diff --git a/tests/impl.h b/tests/impl.h
index 4af0ac2d..0eff143b 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -171,16 +171,16 @@
   _(vmulxq_f64)             \
   _(vmulxs_f32)             \
   _(vmulxd_f64)             \
-  /*_(vmulx_lane_f32) */    \
-  /*_(vmulxq_lane_f32) */   \
-  /*_(vmulx_lane_f64) */    \
-  /*_(vmulxq_lane_f64) */   \
+  _(vmulx_lane_f32)         \
+  _(vmulxq_lane_f32)        \
+  _(vmulx_lane_f64)         \
+  _(vmulxq_lane_f64)        \
   /*_(vmulxs_lane_f32) */   \
   /*_(vmulxd_lane_f64) */   \
-  /*_(vmulx_laneq_f32) */   \
-  /*_(vmulxq_laneq_f32) */  \
-  /*_(vmulx_laneq_f64) */   \
-  /*_(vmulxq_laneq_f64) */  \
+  _(vmulx_laneq_f32)        \
+  _(vmulxq_laneq_f32)       \
+  _(vmulx_laneq_f64)        \
+  _(vmulxq_laneq_f64)       \
   /*_(vmulxs_laneq_f32) */  \
   /*_(vmulxd_laneq_f64) */  \
   _(vdiv_f32)               \
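
A note on the reference computation in the tests above: vmulx corresponds to
AArch64 FMULX, which behaves like an ordinary floating-point multiply except
that (+/-0) x (+/-Inf) returns +/-2.0 instead of NaN. The tests use a plain
`_a[i] * _b[IDX]` product as the expected value, which matches FMULX only for
inputs that avoid that special case. A minimal scalar sketch of the full
semantics, assuming C99 <math.h>; `fmulx_ref` is a hypothetical helper for
illustration, not part of the patch:

#include <math.h>

/* Hypothetical scalar reference for FMULX semantics: an ordinary product,
 * except that (+/-0) * (+/-Inf) yields 2.0 with the XOR of the operand
 * signs instead of NaN. */
static double fmulx_ref(double a, double b) {
  if ((a == 0.0 && isinf(b)) || (isinf(a) && b == 0.0)) {
    return (signbit(a) != signbit(b)) ? -2.0 : 2.0;
  }
  return a * b;
}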