From 3d5c52963b3877815bcd8d6f7972a9c562668798 Mon Sep 17 00:00:00 2001
From: Yang Hau
Date: Tue, 23 Jul 2024 23:19:10 +0800
Subject: [PATCH] feat: Add vqdmull_high_n_[s16|s32]

Implement the signed saturating doubling multiply long intrinsics that
operate on the upper half of the first operand and a scalar:

  vqdmull_high_n_s16: r[i] = sat32(2 * a[i + 4] * b), i = 0..3
  vqdmull_high_n_s32: r[i] = sat64(2 * a[i + 2] * b), i = 0..1

The upper half of `a` is moved to the low lanes with vslidedown and
multiplied by `b` with a widening multiply, so the product itself cannot
overflow. The doubling step is done with a saturating add of the product
to itself (vsadd) rather than a left shift by one: when both the lane and
the scalar are the minimum value of their type, the doubled product is
one past the maximum of the wide type, and a plain shift would wrap to
the minimum value instead of clamping to the maximum.

The tests compare the upper-half lanes against the sat_dmull reference
helper and enable both intrinsics in the test list.
---
 neon2rvv.h     | 12 ++++++++++--
 tests/impl.cpp | 34 ++++++++++++++++++++++++++++++++--
 tests/impl.h   |  4 ++--
 3 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/neon2rvv.h b/neon2rvv.h
index 8cd0f9ec..817dfd76 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -10617,9 +10617,17 @@ FORCE_INLINE int64x2_t vqdmull_n_s32(int32x2_t a, int32_t b) {
   return __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsll_vx_i64m2(ab_mul, 1, 2));
 }
 
-// FORCE_INLINE int32x4_t vqdmull_high_n_s16(int16x8_t a, int16_t b);
+FORCE_INLINE int32x4_t vqdmull_high_n_s16(int16x8_t a, int16_t b) {
+  vint16m1_t a_high = __riscv_vslidedown_vx_i16m1(a, 4, 8);
+  vint32m2_t ab_mul = __riscv_vwmul_vx_i32m2(a_high, b, 4);
+  return __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsadd_vv_i32m2(ab_mul, ab_mul, 4));
+}
 
-// FORCE_INLINE int64x2_t vqdmull_high_n_s32(int32x4_t a, int32_t b);
+FORCE_INLINE int64x2_t vqdmull_high_n_s32(int32x4_t a, int32_t b) {
+  vint32m1_t a_high = __riscv_vslidedown_vx_i32m1(a, 2, 4);
+  vint64m2_t ab_mul = __riscv_vwmul_vx_i64m2(a_high, b, 2);
+  return __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsadd_vv_i64m2(ab_mul, ab_mul, 2));
+}
 
 FORCE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   vint16m1_t b_dup = vdupq_n_s16(b);
diff --git a/tests/impl.cpp b/tests/impl.cpp
index bad19c1b..333c7720 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -33960,9 +33960,39 @@ result_t test_vqdmull_n_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #endif // ENABLE_TEST_ALL
 }
 
-result_t test_vqdmull_high_n_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqdmull_high_n_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
+  int32_t _c[4];
+  int16x8_t a = vld1q_s16(_a);
+
+  for (int i = 0; i < 4; i++) {
+    _c[i] = sat_dmull(_a[i + 4], _b[0]);
+  }
+  int32x4_t c = vqdmull_high_n_s16(a, _b[0]);
+  return validate_int32(c, _c[0], _c[1], _c[2], _c[3]);
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}
 
-result_t test_vqdmull_high_n_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqdmull_high_n_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
+  const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
+  int64_t _c[2];
+  int32x4_t a = vld1q_s32(_a);
+
+  for (int i = 0; i < 2; i++) {
+    _c[i] = sat_dmull(_a[i + 2], _b[0]);
+  }
+  int64x2_t c = vqdmull_high_n_s32(a, _b[0]);
+  return validate_int64(c, _c[0], _c[1]);
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}
 
 result_t test_vqdmulhq_n_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
diff --git a/tests/impl.h b/tests/impl.h
index 4d0cad98..85a74fd9 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -2197,8 +2197,8 @@
   _(vmull_high_n_u32) \
   _(vqdmull_n_s16) \
   _(vqdmull_n_s32) \
-  /*_(vqdmull_high_n_s16) */ \
-  /*_(vqdmull_high_n_s32) */ \
+  _(vqdmull_high_n_s16) \
+  _(vqdmull_high_n_s32) \
   _(vqdmulhq_n_s16) \
   _(vqdmulhq_n_s32) \
   _(vqdmulh_n_s16) \