From 2a548e591c74fb26b88c5c25dd295f58dee22069 Mon Sep 17 00:00:00 2001 From: Zhijin Zeng Date: Mon, 15 Apr 2024 18:43:36 +0800 Subject: [PATCH] arm: fix some neon2rvv intrinsic function error 1. For vqdmlal_s16/s32: the doubled product may overflow, so vqaddq_s32/s64 must be used to saturate it. The same applies to vqdmlsl_s16/s32. 2. The vqrdmulh family of functions needs saturating addition so that the doubled result does not overflow. 3. The result of the vrshl family of functions needs to keep the sign bit of the original data. If a > 0 && b < 0, the result of (a + (1 << (-b - 1))) may overflow into a negative value. In gcc/clang, >> on a signed value is an arithmetic right shift, so the result gets the wrong sign bit unless the value is first converted to unsigned. Signed-off-by: Zhijin Zeng --- simde/arm/neon/qdmlal.h | 17 +++++-------- simde/arm/neon/qdmlsl.h | 18 ++++++-------- simde/arm/neon/qrdmulh.h | 10 ++++++-- simde/arm/neon/qrshl.h | 20 ++++++++++++--- simde/arm/neon/rshl.h | 30 +++++++++++++++++------ test/arm/neon/qdmlal.c | 8 ++++++ test/arm/neon/qdmlsl.c | 8 ++++++ test/arm/neon/qrdmulh.c | 14 +++++++++++ test/arm/neon/qrdmulh_n.c | 12 +++++++++ test/arm/neon/qrshl.c | 51 +++++++++++++++++++++++++++++++++++++++ test/arm/neon/rshl.c | 43 +++++++++++++++++++++++++++++++++ 11 files changed, 195 insertions(+), 36 deletions(-) diff --git a/simde/arm/neon/qdmlal.h b/simde/arm/neon/qdmlal.h index b23ab6fca..fe96b0fc8 100644 --- a/simde/arm/neon/qdmlal.h +++ b/simde/arm/neon/qdmlal.h @@ -31,6 +31,7 @@ #include "mul.h" #include "mul_n.h" #include "movl.h" +#include "qadd.h" #include "types.h" HEDLEY_DIAGNOSTIC_PUSH @@ -71,7 +72,8 @@ simde_vqdmlal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vqdmlal_s16(a, b, c); #else - return simde_vaddq_s32(simde_vmulq_n_s32(simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c)), 2), a); + simde_int32x4_t temp = simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c)); + return 
simde_vqaddq_s32(simde_vqaddq_s32(temp, temp), a); #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -85,17 +87,10 @@ simde_vqdmlal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vqdmlal_s32(a, b, c); #else - simde_int64x2_private r_ = simde_int64x2_to_private( - simde_x_vmulq_s64( + simde_int64x2_t r = simde_x_vmulq_s64( simde_vmovl_s32(b), - simde_vmovl_s32(c))); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); - } - - return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); + simde_vmovl_s32(c)); + return simde_vqaddq_s64(a, simde_vqaddq_s64(r, r)); #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/qdmlsl.h b/simde/arm/neon/qdmlsl.h index e7770ac61..68e17ca05 100644 --- a/simde/arm/neon/qdmlsl.h +++ b/simde/arm/neon/qdmlsl.h @@ -31,6 +31,8 @@ #include "mul.h" #include "mul_n.h" #include "movl.h" +#include "qadd.h" +#include "qsub.h" #include "types.h" HEDLEY_DIAGNOSTIC_PUSH @@ -71,7 +73,8 @@ simde_vqdmlsl_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vqdmlsl_s16(a, b, c); #else - return simde_vsubq_s32(a, simde_vmulq_n_s32(simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c)), 2)); + simde_int32x4_t temp = simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c)); + return simde_vqsubq_s32(a, simde_vqaddq_s32(temp, temp)); #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -85,17 +88,10 @@ simde_vqdmlsl_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vqdmlsl_s32(a, b, c); #else - simde_int64x2_private r_ = simde_int64x2_to_private( - simde_x_vmulq_s64( + simde_int64x2_t r = simde_x_vmulq_s64( simde_vmovl_s32(b), - simde_vmovl_s32(c))); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < 
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); - } - - return simde_vsubq_s64(a, simde_int64x2_from_private(r_)); + simde_vmovl_s32(c)); + return simde_vqsubq_s64(a, simde_vqaddq_s64(r, r)); #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/qrdmulh.h b/simde/arm/neon/qrdmulh.h index 9a69b92e5..55fedfe72 100644 --- a/simde/arm/neon/qrdmulh.h +++ b/simde/arm/neon/qrdmulh.h @@ -40,7 +40,10 @@ simde_vqrdmulhh_s16(int16_t a, int16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqrdmulhh_s16(a, b); #else - return HEDLEY_STATIC_CAST(int16_t, (((1 << 15) + ((HEDLEY_STATIC_CAST(int32_t, (HEDLEY_STATIC_CAST(int32_t, a) * HEDLEY_STATIC_CAST(int32_t, b)))) << 1)) >> 16) & 0xffff); + int32_t temp = HEDLEY_STATIC_CAST(int32_t, a) * HEDLEY_STATIC_CAST(int32_t, b); + int32_t r = temp > 0 ? (temp > (INT32_MAX >> 1) ? INT32_MAX : (temp << 1)) : (temp < (INT32_MIN >> 1) ? INT32_MIN : (temp << 1)); + r = (r > (INT32_MAX - (1 << 15))) ? INT32_MAX : ((1 << 15) + r); + return HEDLEY_STATIC_CAST(int16_t, ((r >> 16) & 0xffff)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -54,7 +57,10 @@ simde_vqrdmulhs_s32(int32_t a, int32_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqrdmulhs_s32(a, b); #else - return HEDLEY_STATIC_CAST(int32_t, (((HEDLEY_STATIC_CAST(int64_t, 1) << 31) + ((HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(int64_t, a) * HEDLEY_STATIC_CAST(int64_t, b)))) << 1)) >> 32) & 0xffffffff); + int64_t temp = HEDLEY_STATIC_CAST(int64_t, a) * HEDLEY_STATIC_CAST(int64_t, b); + int64_t r = temp > 0 ? (temp > (INT64_MAX >> 1) ? INT64_MAX : (temp << 1)) : (temp < (INT64_MIN >> 1) ? INT64_MIN : (temp << 1)); + r = (r > (INT64_MAX - (HEDLEY_STATIC_CAST(int64_t, 1) << 31))) ? 
INT64_MAX : ((HEDLEY_STATIC_CAST(int64_t, 1) << 31) + r); + return HEDLEY_STATIC_CAST(int32_t, ((r >> 32) & 0xffffffff)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/qrshl.h b/simde/arm/neon/qrshl.h index cd30b6ff5..e91435619 100644 --- a/simde/arm/neon/qrshl.h +++ b/simde/arm/neon/qrshl.h @@ -44,7 +44,10 @@ simde_vqrshlb_s8(int8_t a, int8_t b) { if (b < -8) { r = 0; } else if (b < 0) { - r = HEDLEY_STATIC_CAST(int8_t, ((a + (1 << (-b - 1))) >> -b)); + r = HEDLEY_STATIC_CAST(int8_t, a <= 0 + ? ((a + (1 << (-b - 1))) >> -b) + : HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a + (1 << (-b - 1)))) >> -b) & 0x7FUL))); } else if (b == 0) { r = a; } else if (b < 7) { @@ -79,7 +82,10 @@ simde_vqrshlh_s16(int16_t a, int16_t b) { if (b8 <= -16) { r = 0; } else if (b8 < 0) { - r = HEDLEY_STATIC_CAST(int16_t, ((a + (1 << (-b8 - 1))) >> -b8)); + r = HEDLEY_STATIC_CAST(int16_t, a <= 0 + ? ((a + (1 << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a + (1 << (-b8 - 1)))) >> -b8) & 0x7FFFUL))); } else if (b8 == 0) { r = a; } else if (b8 < 15) { @@ -114,7 +120,10 @@ simde_vqrshls_s32(int32_t a, int32_t b) { if (b8 <= -32) { r = 0; } else if (b8 < 0) { - r = ((a + (1 << (-b8 - 1))) >> -b8); + r = a <= 0 + ? ((a + (1 << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a + (1 << (-b8 - 1)))) >> -b8) & 0x7FFFFFFFUL)); } else if (b8 == 0) { r = a; } else if (b8 < 31) { @@ -149,7 +158,10 @@ simde_vqrshld_s64(int64_t a, int64_t b) { if (b8 <= -64) { r = 0; } else if (b8 < 0) { - r = ((a + (INT64_C(1) << (-b8 - 1))) >> -b8); + r = a <= 0 + ? 
((a + (INT64_C(1) << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int64_t, ((HEDLEY_STATIC_CAST(uint64_t, + (a + (INT64_C(1) << (-b8 - 1)))) >> -b8) & 0x7FFFFFFFFFFFFFFFUL)); } else if (b8 == 0) { r = a; } else if (b8 < 63) { diff --git a/simde/arm/neon/rshl.h b/simde/arm/neon/rshl.h index 260eda332..091a9a407 100644 --- a/simde/arm/neon/rshl.h +++ b/simde/arm/neon/rshl.h @@ -84,7 +84,9 @@ simde_vrshld_s64(int64_t a, int64_t b) { ? 0 : (b >= 0) ? (a << b) - : ((a + (INT64_C(1) << (-b - 1))) >> -b); + : (a <= 0 + ? ((a + (INT64_C(1) << (-b - 1))) >> -b) + : HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(uint64_t, (a + (INT64_C(1) << (-b - 1)))) >> -b))); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -148,7 +150,9 @@ simde_vrshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (simde_math_abs(b_.values[i]) >= 8) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FUL))))); } #endif @@ -189,7 +193,9 @@ simde_vrshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (simde_math_abs(b_.values[i]) >= 16) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFUL))))); } #endif @@ -230,7 +236,9 @@ simde_vrshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (simde_math_abs(b_.values[i]) >= 32) ? 0 : (b_.values[i] >= 0) ? 
(a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFFFFFUL))))); } #endif @@ -513,7 +521,9 @@ simde_vrshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (simde_math_abs(b_.values[i]) >= 8) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FUL))))); } #endif @@ -580,7 +590,9 @@ simde_vrshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (simde_math_abs(b_.values[i]) >= 16) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFUL))))); } #endif @@ -634,8 +646,10 @@ simde_vrshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) { b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (simde_math_abs(b_.values[i]) >= 32) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + ((a_.values[i] <= 0) ? 
((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0X7FFFFFFFUL))))); } #endif diff --git a/test/arm/neon/qdmlal.c b/test/arm/neon/qdmlal.c index d33e449d1..f1c629abe 100644 --- a/test/arm/neon/qdmlal.c +++ b/test/arm/neon/qdmlal.c @@ -145,6 +145,10 @@ test_simde_vqdmlal_s16 (SIMDE_MUNIT_TEST_ARGS) { { INT16_C( 8642), -INT16_C( 579), INT16_C( 2963), INT16_C( 9252) }, { -INT16_C( 7314), -INT16_C( 5230), INT16_C( 8688), INT16_C( 5749) }, { -INT32_C( 126337050), INT32_C( 6064180), INT32_C( 51565328), INT32_C( 106311312) } }, + { { -INT32_C( 16), -INT32_C( 15), -INT32_C( 14), -INT32_C( 13) }, + { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + { INT32_C(2147483631), INT32_C(2147483632), INT32_C(2147483633), INT32_C(2147483634) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -199,6 +203,10 @@ test_simde_vqdmlal_s32 (SIMDE_MUNIT_TEST_ARGS) { { -INT32_C( 8397045), INT32_C( 6142639) }, { -INT32_C( 6977990), -INT32_C( 5600341) }, { INT64_C(117189633707070), -INT64_C( 68801010159595) } }, + { { -INT64_C( 16), -INT64_C( 15) }, + { INT32_MIN, INT32_MIN }, + { INT32_MIN, INT32_MIN }, + {INT64_C(9223372036854775791), INT64_C(9223372036854775792) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { diff --git a/test/arm/neon/qdmlsl.c b/test/arm/neon/qdmlsl.c index c2b9968e2..aa972a671 100644 --- a/test/arm/neon/qdmlsl.c +++ b/test/arm/neon/qdmlsl.c @@ -145,6 +145,10 @@ test_simde_vqdmlsl_s16 (SIMDE_MUNIT_TEST_ARGS) { { INT16_C( 8057), -INT16_C( 9339), INT16_C( 1806), INT16_C( 8600) }, { -INT16_C( 751), -INT16_C( 6991), INT16_C( 1494), -INT16_C( 6795)}, { INT32_C( 16070724), -INT32_C( 122085335), INT32_C( 18446020), INT32_C( 153174877) } }, + { { -INT32_C( 16), -INT32_C( 15), -INT32_C( 14), -INT32_C( 13) }, + { INT16_MIN, 
INT16_MIN, INT16_MIN, INT16_MIN }, + { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + { INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -199,6 +203,10 @@ test_simde_vqdmlsl_s32 (SIMDE_MUNIT_TEST_ARGS) { { -INT32_C( 305245), -INT32_C( 548274) }, { -INT32_C( 805474), INT32_C( 431866)}, { -INT64_C( 353429066965), -INT64_C( 166180089563) } }, + { { -INT64_C( 16), -INT64_C( 15) }, + { INT32_MIN, INT32_MIN }, + { INT32_MIN, INT32_MIN }, + { INT64_MIN, INT64_MIN } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { diff --git a/test/arm/neon/qrdmulh.c b/test/arm/neon/qrdmulh.c index b82cac034..6d20bf76e 100644 --- a/test/arm/neon/qrdmulh.c +++ b/test/arm/neon/qrdmulh.c @@ -35,6 +35,9 @@ test_simde_vqrdmulh_s16 (SIMDE_MUNIT_TEST_ARGS) { { { INT16_C( 31066), INT16_C( 19881), INT16_C( 14863), INT16_C( 16264) }, { INT16_C( 17499), INT16_C( 19391), -INT16_C( 23792), -INT16_C( 25706) }, { INT16_C( 16590), INT16_C( 11765), -INT16_C( 10792), -INT16_C( 12759) } }, + { { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX } }, }; @@ -94,6 +97,9 @@ test_simde_vqrdmulh_s32 (SIMDE_MUNIT_TEST_ARGS) { { { -INT32_C( 1216301242), INT32_C( 231209245) }, { INT32_C( 1833478310), -INT32_C( 429409792) }, { -INT32_C( 1038453516), -INT32_C( 46232489) } }, + { { INT32_MIN, INT32_MIN }, + { INT32_MIN, INT32_MIN }, + { INT32_MAX, INT32_MAX } }, }; @@ -153,6 +159,11 @@ test_simde_vqrdmulhq_s16 (SIMDE_MUNIT_TEST_ARGS) { { { INT16_C( 28579), INT16_C( 26571), INT16_C( 23618), INT16_C( 3470), INT16_C( 10594), INT16_C( 31318), -INT16_C( 24794), INT16_C( 1860) }, { -INT16_C( 22526), -INT16_C( 12632), INT16_C( 21464), INT16_C( 8577), INT16_C( 28627), INT16_C( 27596), -INT16_C( 26895), -INT16_C( 27290) }, { -INT16_C( 19646), -INT16_C( 10243), INT16_C( 15470), INT16_C( 908), INT16_C( 9255), INT16_C( 26375), 
INT16_C( 20350), -INT16_C( 1549) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX } }, +#endif }; @@ -212,6 +223,9 @@ test_simde_vqrdmulhq_s32 (SIMDE_MUNIT_TEST_ARGS) { { { -INT32_C( 613662219), -INT32_C( 1259034176), INT32_C( 1695972338), -INT32_C( 22565202) }, { INT32_C( 1459986413), INT32_C( 865007473), -INT32_C( 921225670), -INT32_C( 335884554) }, { -INT32_C( 417203876), -INT32_C( 507139587), -INT32_C( 727536740), INT32_C( 3529388) } }, + { { INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN }, + { INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN }, + { INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX } }, }; diff --git a/test/arm/neon/qrdmulh_n.c b/test/arm/neon/qrdmulh_n.c index 3809375aa..0e689f7cd 100644 --- a/test/arm/neon/qrdmulh_n.c +++ b/test/arm/neon/qrdmulh_n.c @@ -35,6 +35,9 @@ test_simde_vqrdmulh_n_s16 (SIMDE_MUNIT_TEST_ARGS) { { { -INT16_C( 14394), INT16_C( 28773), INT16_C( 30122), -INT16_C( 574) }, -INT16_C( 10708), { INT16_C( 4704), -INT16_C( 9403), -INT16_C( 9843), INT16_C( 188) } }, + { { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + INT16_MIN, + { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX } }, }; @@ -94,6 +97,9 @@ test_simde_vqrdmulh_n_s32 (SIMDE_MUNIT_TEST_ARGS) { { { -INT32_C( 1238271146), INT32_C( 1164109663) }, INT32_C( 737217376), { -INT32_C( 425090550), INT32_C( 399631388) } }, + { { INT32_MIN, INT32_MIN }, + INT32_MIN, + { INT32_MAX, INT32_MAX } }, }; @@ -153,6 +159,9 @@ test_simde_vqrdmulhq_n_s16 (SIMDE_MUNIT_TEST_ARGS) { { { INT16_C( 15761), INT16_C( 23849), INT16_C( 9736), INT16_C( 26802), INT16_C( 27881), -INT16_C( 7053), -INT16_C( 14710), -INT16_C( 23581) }, INT16_C( 25688), { INT16_C( 12356), INT16_C( 18696), INT16_C( 7632), INT16_C( 21011), 
INT16_C( 21857), -INT16_C( 5529), -INT16_C( 11532), -INT16_C( 18486) } }, + { { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, + INT16_MIN, + { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX } }, }; @@ -212,6 +221,9 @@ test_simde_vqrdmulhq_n_s32 (SIMDE_MUNIT_TEST_ARGS) { { { -INT32_C( 219487289), INT32_C( 1420994589), INT32_C( 889110344), -INT32_C( 2103115347) }, INT32_C( 1735639961), { -INT32_C( 177394091), INT32_C( 1148476728), INT32_C( 718597063), -INT32_C( 1699780598) } }, + { { INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN }, + INT32_MIN, + { INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX } }, }; diff --git a/test/arm/neon/qrshl.c b/test/arm/neon/qrshl.c index 97180cb28..9d6c28426 100644 --- a/test/arm/neon/qrshl.c +++ b/test/arm/neon/qrshl.c @@ -46,6 +46,9 @@ test_simde_vqrshlb_s8 (SIMDE_MUNIT_TEST_ARGS) { { INT8_C( 7), INT8_C( 0), INT8_C( 7) }, + { INT8_MAX, + -INT8_C( 1), + INT8_C( 64) }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -103,6 +106,9 @@ test_simde_vqrshlh_s16 (SIMDE_MUNIT_TEST_ARGS) { { INT16_C( 329), INT16_C( 14), INT16_MAX }, + { INT16_MAX, + -INT16_C( 1), + INT16_C( 16384) }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -184,6 +190,9 @@ test_simde_vqrshls_s32 (SIMDE_MUNIT_TEST_ARGS) { { INT32_C( 435262094), INT32_C( 32), INT32_MAX }, + { INT32_MAX, + -INT32_C( 1), + INT32_C( 1073741824) }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -241,6 +250,9 @@ test_simde_vqrshld_s64 (SIMDE_MUNIT_TEST_ARGS) { { INT64_C( 3143776020433277350), INT64_C( 21), INT64_MAX }, + { INT64_MAX, + -INT64_C( 1), + INT64_C( 4611686018427387904) }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -553,6 +565,12 @@ test_simde_vqrshl_s8 (SIMDE_MUNIT_TEST_ARGS) { -INT8_C( 2), INT8_C( 5), -INT8_C( 6), -INT8_C( 5) }, { INT8_C( 0), INT8_MIN, INT8_MAX, -INT8_C( 124), 
-INT8_C( 32), INT8_MAX, INT8_C( 1), INT8_C( 0) } }, + { { INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, + INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX }, + { -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), + -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1) }, + { INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), + INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -613,6 +631,9 @@ test_simde_vqrshl_s16 (SIMDE_MUNIT_TEST_ARGS) { { { INT16_C( 14453), -INT16_C( 12196), INT16_C( 27445), INT16_C( 31840) }, { INT16_C( 14), INT16_C( 5), INT16_C( 13), -INT16_C( 9) }, { INT16_MAX, INT16_MIN, INT16_MAX, INT16_C( 62) } }, + { { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX }, + { -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1) }, + { INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -672,6 +693,9 @@ test_simde_vqrshl_s32 (SIMDE_MUNIT_TEST_ARGS) { { { INT32_C( 58593943), -INT32_C( 594339506) }, { -INT32_C( 21), -INT32_C( 9) }, { INT32_C( 28), -INT32_C( 1160819) } }, + { { INT32_MAX, INT32_MAX }, + { -INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1073741824), INT32_C( 1073741824) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -731,6 +755,9 @@ test_simde_vqrshl_s64 (SIMDE_MUNIT_TEST_ARGS) { { { INT64_C( 8846445702649439056) }, { -INT64_C( 5) }, { INT64_C( 276451428207794971) } }, + { { INT64_MAX }, + { -INT64_C( 1) }, + { INT64_C( 4611686018427387904) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -1119,6 +1146,18 @@ test_simde_vqrshlq_s8 (SIMDE_MUNIT_TEST_ARGS) { -INT8_C( 7), INT8_MIN, INT8_MAX, INT8_MAX, -INT8_C( 24), INT8_C( 4), INT8_MAX, INT8_MIN, -INT8_C( 82), INT8_C( 2), INT8_C( 2), INT8_C( 42) } }, + { { INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, + INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, + INT8_MAX, INT8_MAX, INT8_MAX, 
INT8_MAX, + INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX }, + { -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), + -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), + -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), + -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1) }, + { INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), + INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), + INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), + INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -1202,6 +1241,12 @@ test_simde_vqrshlq_s16 (SIMDE_MUNIT_TEST_ARGS) { INT16_C( 0), INT16_C( 15), -INT16_C( 5), -INT16_C( 13) }, { -INT16_C( 10376), INT16_MAX, INT16_MAX, INT16_MAX, -INT16_C( 2630), INT16_MIN, -INT16_C( 651), INT16_C( 0) } }, + { { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, + INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX }, + { -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), + -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1) }, + { INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), + INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -1261,6 +1306,9 @@ test_simde_vqrshlq_s32 (SIMDE_MUNIT_TEST_ARGS) { { { INT32_C( 219035158), INT32_C( 2118934887), INT32_C( 641686675), INT32_C( 264846024) }, { INT32_C( 6), INT32_C( 28), -INT32_C( 1), INT32_C( 12) }, { INT32_MAX, INT32_MAX, INT32_C( 320843338), INT32_MAX } }, + { { INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX }, + { -INT32_C( 1), -INT32_C( 1), -INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1073741824), INT32_C( 1073741824), INT32_C( 1073741824), INT32_C( 1073741824) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -1319,6 +1367,9 @@ test_simde_vqrshlq_s64 (SIMDE_MUNIT_TEST_ARGS) { { { INT64_C( 570160631599597498), INT64_C( 7291438976119207855) }, { -INT64_C( 54), INT64_C( 30) }, { 
INT64_C( 32), INT64_MAX } }, + { { INT64_MAX, INT64_MAX }, + { -INT64_C( 1), -INT64_C( 1) }, + { INT64_C( 4611686018427387904), INT64_C( 4611686018427387904) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { diff --git a/test/arm/neon/rshl.c b/test/arm/neon/rshl.c index 8e7862ca1..6502b0275 100644 --- a/test/arm/neon/rshl.c +++ b/test/arm/neon/rshl.c @@ -46,6 +46,11 @@ test_simde_vrshl_s8 (SIMDE_MUNIT_TEST_ARGS) { { { -INT8_C( 12), INT8_C( 85), -INT8_C( 96), INT8_C( 21), -INT8_C( 69), -INT8_C( 127), -INT8_C( 84), INT8_C( 79) }, { INT8_C( 7), -INT8_C( 7), INT8_C( 5), -INT8_C( 7), -INT8_C( 37), -INT8_C( 7), -INT8_C( 7), -INT8_C( 10) }, { INT8_C( 0), INT8_C( 1), INT8_C( 0), INT8_C( 0), INT8_C( 0), -INT8_C( 1), -INT8_C( 1), INT8_C( 0) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX }, + { -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1) }, + { INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64) } }, +#endif }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -124,6 +129,11 @@ test_simde_vrshl_s16 (SIMDE_MUNIT_TEST_ARGS) { { { -INT16_C( 30838), INT16_C( 27999), -INT16_C( 18012), -INT16_C( 18857) }, { INT16_C( 15), -INT16_C( 9), INT16_C( 9), INT16_C( 31646) }, { INT16_C( 0), INT16_C( 55), INT16_C( 18432), INT16_C( 0) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX}, + { -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1) }, + { INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384) } }, +#endif }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -203,6 +213,11 @@ test_simde_vrshl_s32 (SIMDE_MUNIT_TEST_ARGS) { { { -INT32_C( 1295350051), -INT32_C( 413045626) }, { -INT32_C( 25), 
-INT32_C( 30) }, { -INT32_C( 39), INT32_C( 0) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT32_MAX, INT32_MAX }, + { -INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1073741824), INT32_C( 1073741824) } }, +#endif }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -283,6 +298,11 @@ test_simde_vrshl_s64 (SIMDE_MUNIT_TEST_ARGS) { { { -INT64_C( 4313522761509692639) }, { -INT64_C( 43) }, { -INT64_C( 490391) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT64_MAX }, + { -INT64_C( 1) }, + { INT64_C( 4611686018427387904) } }, +#endif }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -700,6 +720,14 @@ test_simde_vrshlq_s8 (SIMDE_MUNIT_TEST_ARGS) { INT8_C( 6), INT8_C( 8), -INT8_C( 6), INT8_C( 8), -INT8_C( 8), -INT8_C( 5), -INT8_C( 6), INT8_C( 5) }, { INT8_C( 0), -INT8_C( 64), INT8_C( 0), INT8_C( 0), -INT8_C( 1), INT8_C( 2), -INT8_C( 64), INT8_C( 0), INT8_C( 64), INT8_C( 0), -INT8_C( 2), INT8_C( 0), INT8_C( 0), INT8_C( 0), -INT8_C( 2), INT8_C( 64) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, + INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX }, + { -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), + -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1), -INT8_C( 1) }, + { INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), + INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64), INT8_C( 64) } }, +#endif }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -780,6 +808,11 @@ test_simde_vrshlq_s16 (SIMDE_MUNIT_TEST_ARGS) { { { INT16_C( 31504), INT16_C( 25108), -INT16_C( 10829), INT16_C( 20263), INT16_C( 26767), 
-INT16_C( 20705), -INT16_C( 22702), -INT16_C( 18381) }, { INT16_C( 11), -INT16_C( 13), INT16_C( 11), INT16_C( 10), INT16_C( 12), -INT16_C( 15), INT16_C( 14), INT16_C( 20055) }, { INT16_MIN, INT16_C( 3), -INT16_C( 26624), -INT16_C( 25600), -INT16_C( 4096), -INT16_C( 1), INT16_MIN, INT16_C( 0) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX }, + { -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1), -INT16_C( 1) }, + { INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384), INT16_C( 16384) } }, +#endif }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -860,6 +893,11 @@ test_simde_vrshlq_s32 (SIMDE_MUNIT_TEST_ARGS) { { { INT32_C( 1269846085), -INT32_C( 1900169466), INT32_C( 1047704628), INT32_C( 498857806) }, { INT32_C( 27), INT32_C( 16), INT32_C( 30), -INT32_C( 25) }, { INT32_C( 671088640), -INT32_C( 1224343552), INT32_C( 0), INT32_C( 15) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX }, + { -INT32_C( 1), -INT32_C( 1), -INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1073741824), INT32_C( 1073741824), INT32_C( 1073741824), INT32_C( 1073741824) } }, +#endif }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -939,6 +977,11 @@ test_simde_vrshlq_s64 (SIMDE_MUNIT_TEST_ARGS) { { { INT64_C( 1082637037262742893), INT64_C( 6751871869302762015) }, { INT64_C( 5088688905005675581), INT64_C( 45) }, { -INT64_C( 6917529027641081856), -INT64_C( 3655832181890088960) } }, +#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE) + { { INT64_MAX, INT64_MAX }, + { -INT64_C( 1), -INT64_C( 1) }, + { INT64_C( 4611686018427387904), INT64_C( 4611686018427387904) } }, +#endif }; for (size_t i = 0 ; i < (sizeof(test_vec) / 
sizeof(test_vec[0])) ; i++) {