From 1c75c7b07b254532fa2e8bb93523b3ae52d1189b Mon Sep 17 00:00:00 2001 From: LI Qingwu Date: Tue, 10 Sep 2024 19:05:32 +0800 Subject: [PATCH] arm neon: avoid GCC 11 vst1_*_x4 built-in functions The vst1_*_x4 built-in functions in GCC 11 produce incorrect results, as the following example shows. This patch circumvents the issue by avoiding these functions in GCC 11 and earlier versions. float32x2x4_t val = {{ {0.10f, 0.20f}, {0.30f, 0.40f}, {0.50f, 0.60f}, {0.70f, 0.80f} }}; float result[16]; vst1_f32_x4(result, val); The result is: result = {0.100000 0.200000 0.500000 0.600000 0.000000 0.000000 0.000000 0.000000} Signed-off-by: LI Qingwu --- simde/arm/neon/st1_x4.h | 38 +++++++++++++++++++++++++------------- simde/simde-common.h | 3 +++ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/simde/arm/neon/st1_x4.h b/simde/arm/neon/st1_x4.h index 41f6db6e1..aa8c17db9 100644 --- a/simde/arm/neon/st1_x4.h +++ b/simde/arm/neon/st1_x4.h @@ -68,7 +68,8 @@ simde_vst1_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x4x4 SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_f32_x4(simde_float32 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float32x2x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_f32_x4(ptr, val); #else simde_vst1_f32(ptr, val.val[0]); @@ -85,7 +86,7 @@ simde_vst1_f32_x4(simde_float32 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float32x2x4_t SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_f64_x4(simde_float64 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x1x4_t val) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_GCC_114521) vst1_f64_x4(ptr, val); #else simde_vst1_f64(ptr, val.val[0]); @@ -102,7 +103,8 @@ simde_vst1_f64_x4(simde_float64 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x1x4_t SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_s8_x4(int8_t ptr[HEDLEY_ARRAY_PARAM(32)], 
simde_int8x8x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_s8_x4(ptr, val); #else simde_vst1_s8(ptr, val.val[0]); @@ -119,7 +121,8 @@ simde_vst1_s8_x4(int8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int8x8x4_t val) { SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_s16_x4(int16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int16x4x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_s16_x4(ptr, val); #else simde_vst1_s16(ptr, val.val[0]); @@ -136,7 +139,8 @@ simde_vst1_s16_x4(int16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int16x4x4_t val) { SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_s32_x4(int32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int32x2x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_s32_x4(ptr, val); #else simde_vst1_s32(ptr, val.val[0]); @@ -153,7 +157,8 @@ simde_vst1_s32_x4(int32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int32x2x4_t val) { SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_s64_x4(int64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int64x1x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_s64_x4(ptr, val); #else simde_vst1_s64(ptr, val.val[0]); @@ -170,7 +175,8 @@ simde_vst1_s64_x4(int64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int64x1x4_t val) { SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_u8_x4(uint8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint8x8x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_u8_x4(ptr, val); #else simde_vst1_u8(ptr, val.val[0]); @@ -187,7 +193,8 @@ simde_vst1_u8_x4(uint8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint8x8x4_t val) { SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_u16_x4(uint16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint16x4x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_u16_x4(ptr, val); #else simde_vst1_u16(ptr, val.val[0]); @@ -204,7 +211,8 @@ simde_vst1_u16_x4(uint16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint16x4x4_t val) SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_u32_x4(uint32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint32x2x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_u32_x4(ptr, val); #else simde_vst1_u32(ptr, val.val[0]); @@ -221,7 +229,8 @@ simde_vst1_u32_x4(uint32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint32x2x4_t val) { SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_u64_x4(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_t val) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_u64_x4(ptr, val); #else simde_vst1_u64(ptr, val.val[0]); @@ -239,7 +248,8 @@ SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ 
+ && !defined(SIMDE_BUG_GCC_114521) vst1_p8_x4(ptr, val); #else simde_poly8x8_private val_[4]; @@ -265,7 +275,8 @@ SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_p16_x4(ptr, val); #else simde_poly16x4_private val_[4]; @@ -291,7 +302,8 @@ SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521) vst1_p64_x4(ptr, val); #else simde_poly64x1_private val_[4]; diff --git a/simde/simde-common.h b/simde/simde-common.h index aacab9dca..212bff620 100644 --- a/simde/simde-common.h +++ b/simde/simde-common.h @@ -1033,6 +1033,9 @@ HEDLEY_DIAGNOSTIC_POP # if (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && !defined(SIMDE_ARCH_AARCH64)) || (!defined(SIMDE_ARCH_AARCH64) && defined(SIMDE_ARCH_ARM)) # define SIMDE_BUG_GCC_REV_260989 # endif +# if !HEDLEY_GCC_VERSION_CHECK(11,5,0) && (defined(SIMDE_ARCH_ARM4) || defined(SIMDE_ARCH_AARCH64)) +# define SIMDE_BUG_GCC_114521 +# endif # if defined(SIMDE_ARCH_ARM) && !defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_GCC_95399 # define SIMDE_BUG_GCC_95471