From fe209e570ecb2662ae84f77b72524f0eacb9658b Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Thu, 25 Jul 2024 03:19:37 +0800 Subject: [PATCH] feat: Add vqmovun_high_[s16|s32|s64] --- neon2rvv.h | 18 ++++++++++--- tests/impl.cpp | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- tests/impl.h | 6 ++--- 3 files changed, 88 insertions(+), 9 deletions(-) diff --git a/neon2rvv.h b/neon2rvv.h index 440ea169..f37d7361 100644 --- a/neon2rvv.h +++ b/neon2rvv.h @@ -8278,11 +8278,23 @@ FORCE_INLINE uint32x2_t vqmovun_s64(int64x2_t a) { // FORCE_INLINE uint32_t vqmovund_s64(int64_t a); -// FORCE_INLINE uint8x16_t vqmovun_high_s16(uint8x8_t r, int16x8_t a); +FORCE_INLINE uint8x16_t vqmovun_high_s16(uint8x8_t r, int16x8_t a) { + vuint16m1_t a_non_neg = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(a, 0, 8)); + vuint8m1_t vqmovun = __riscv_vlmul_ext_v_u8mf2_u8m1(__riscv_vnclipu_wx_u8mf2(a_non_neg, 0, __RISCV_VXRM_RDN, 8)); + return __riscv_vslideup_vx_u8m1(r, vqmovun, 8, 16); +} -// FORCE_INLINE uint16x8_t vqmovun_high_s32(uint16x4_t r, int32x4_t a); +FORCE_INLINE uint16x8_t vqmovun_high_s32(uint16x4_t r, int32x4_t a) { + vuint32m1_t a_non_neg = __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vx_i32m1(a, 0, 4)); + vuint16m1_t vqmovun = __riscv_vlmul_ext_v_u16mf2_u16m1(__riscv_vnclipu_wx_u16mf2(a_non_neg, 0, __RISCV_VXRM_RDN, 4)); + return __riscv_vslideup_vx_u16m1(r, vqmovun, 4, 8); +} -// FORCE_INLINE uint32x4_t vqmovun_high_s64(uint32x2_t r, int64x2_t a); +FORCE_INLINE uint32x4_t vqmovun_high_s64(uint32x2_t r, int64x2_t a) { + vuint64m1_t a_non_neg = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vx_i64m1(a, 0, 2)); + vuint32m1_t vqmovun = __riscv_vlmul_ext_v_u32mf2_u32m1(__riscv_vnclipu_wx_u32mf2(a_non_neg, 0, __RISCV_VXRM_RDN, 2)); + return __riscv_vslideup_vx_u32m1(r, vqmovun, 2, 4); +} FORCE_INLINE int16x8_t vmovl_s8(int8x8_t a) { return __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vsext_vf2_i16m2(a, 8)); } diff --git a/tests/impl.cpp b/tests/impl.cpp index 14c325b2..47ed1976 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -29752,11 +29752,78 @@ result_t test_vqmovuns_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { retu result_t test_vqmovund_s64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; } -result_t test_vqmovun_high_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; } +result_t test_vqmovun_high_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1; + const uint8_t *_r = (uint8_t *)impl.test_cases_int_pointer2; + uint8_t _c[16]; + for (int i = 0; i < 8; i++) { + if (_a[i] < 0) { + _c[i + 8] = 0; + } else if (_a[i] > UINT8_MAX) { + _c[i + 8] = UINT8_MAX; + } else { + _c[i + 8] = _a[i]; + } + _c[i] = _r[i]; + } + int16x8_t a = vld1q_s16(_a); + uint8x8_t r = vld1_u8(_r); + uint8x16_t c = vqmovun_high_s16(r, a); + return validate_uint8(c, _c[0], _c[1], _c[2], _c[3], _c[4], _c[5], _c[6], _c[7], _c[8], _c[9], _c[10], _c[11], _c[12], + _c[13], _c[14], _c[15]); +#else + return TEST_UNIMPL; +#endif // ENABLE_TEST_ALL +} -result_t test_vqmovun_high_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; } +result_t test_vqmovun_high_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { +#ifdef ENABLE_TEST_ALL + const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1; + const uint16_t *_r = (uint16_t *)impl.test_cases_int_pointer2; + uint16_t _c[8]; + for (int i = 0; i < 4; i++) { + if (_a[i] < 0) { + _c[i + 4] = 0; + } else if (_a[i] > UINT16_MAX) { + _c[i + 4] = UINT16_MAX; + } else { + _c[i + 4] = _a[i]; + } + _c[i] = _r[i]; + } + int32x4_t a = vld1q_s32(_a); + uint16x4_t r = vld1_u16(_r); + uint16x8_t c = vqmovun_high_s32(r, a); + return validate_uint16(c, _c[0], _c[1], _c[2], _c[3], _c[4], _c[5], _c[6], _c[7]); +#else + return TEST_UNIMPL; +#endif // ENABLE_TEST_ALL +} -result_t test_vqmovun_high_s64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; } +result_t test_vqmovun_high_s64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { +#ifdef ENABLE_TEST_ALL + const int64_t *_a = (int64_t *)impl.test_cases_int_pointer1; + const uint32_t *_r = (uint32_t *)impl.test_cases_int_pointer2; + uint32_t _c[4]; + for (int i = 0; i < 2; i++) { + if (_a[i] < 0) { + _c[i + 2] = 0; + } else if (_a[i] > UINT32_MAX) { + _c[i + 2] = UINT32_MAX; + } else { + _c[i + 2] = _a[i]; + } + _c[i] = _r[i]; + } + int64x2_t a = vld1q_s64(_a); + uint32x2_t r = vld1_u32(_r); + uint32x4_t c = vqmovun_high_s64(r, a); + return validate_uint32(c, _c[0], _c[1], _c[2], _c[3]); +#else + return TEST_UNIMPL; +#endif // ENABLE_TEST_ALL +} result_t test_vmovl_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { #ifdef ENABLE_TEST_ALL diff --git a/tests/impl.h b/tests/impl.h index 2ac49686..ad7bbeeb 100644 --- a/tests/impl.h +++ b/tests/impl.h @@ -1795,9 +1795,9 @@ /*_(vqmovunh_s16) */ \ /*_(vqmovuns_s32) */ \ /*_(vqmovund_s64) */ \ - /*_(vqmovun_high_s16) */ \ - /*_(vqmovun_high_s32) */ \ - /*_(vqmovun_high_s64) */ \ + _(vqmovun_high_s16) \ + _(vqmovun_high_s32) \ + _(vqmovun_high_s64) \ _(vmovl_s8) \ _(vmovl_s16) \ _(vmovl_s32) \