From 7198d6d3604dea4d287e0cc203492abab99b3248 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Mon, 16 Oct 2023 11:04:29 +0200 Subject: [PATCH] arm: use SIMDE_ARCH_ARM_FMA --- simde/arm/neon/fma.h | 10 +++++----- simde/arm/neon/fma_lane.h | 24 ++++++++++++------------ simde/arm/neon/fma_n.h | 8 ++++---- simde/mips/msa/madd.h | 2 +- simde/simde-arch.h | 3 +++ simde/wasm/relaxed-simd.h | 4 ++-- simde/x86/aes.h | 2 ++ simde/x86/fma.h | 4 ++-- 8 files changed, 31 insertions(+), 26 deletions(-) diff --git a/simde/arm/neon/fma.h b/simde/arm/neon/fma.h index e66ee57b1..7c1a00118 100644 --- a/simde/arm/neon/fma.h +++ b/simde/arm/neon/fma.h @@ -38,7 +38,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfma_f32(a, b, c); #else return simde_vadd_f32(a, simde_vmul_f32(b, c)); @@ -52,7 +52,7 @@ simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfma_f64(a, b, c); #else return simde_vadd_f64(a, simde_vmul_f64(b, c)); @@ -66,7 +66,7 @@ simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vfmaq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) return vfmaq_f16(a, b, c); #else return simde_vaddq_f16(a, simde_vmulq_f16(b, c)); @@ -80,7 +80,7 @@ simde_vfmaq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f32(a, b, c); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_madd(b, c, a); @@ -109,7 +109,7 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f64(a, b, c); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) return vec_madd(b, c, a); diff --git a/simde/arm/neon/fma_lane.h b/simde/arm/neon/fma_lane.h index 6100ed78c..bf4edcbb9 100644 --- a/simde/arm/neon/fma_lane.h +++ b/simde/arm/neon/fma_lane.h @@ -38,7 +38,7 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ /* simde_vfmad_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmad_lane_f64(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmad_lane_f64(a, b, v, lane)) @@ -61,7 +61,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmad_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmad_laneq_f64(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmad_laneq_f64(a, b, v, lane)) @@ -84,7 +84,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmas_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmas_lane_f32(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmas_lane_f32(a, b, v, lane)) @@ -107,7 +107,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmas_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmas_laneq_f32(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmas_laneq_f32(a, b, v, lane)) @@ -130,7 +130,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfma_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_lane_f32(a, b, v, lane) vfma_lane_f32(a, b, v, lane) #else #define simde_vfma_lane_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_lane_f32(b, v, lane)) @@ -141,7 +141,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfma_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_lane_f64(a, b, v, lane) vfma_lane_f64((a), (b), (v), (lane)) #else #define simde_vfma_lane_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_lane_f64(b, v, lane)) @@ -152,7 +152,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfma_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_laneq_f32(a, b, v, lane) vfma_laneq_f32((a), (b), (v), (lane)) #else #define simde_vfma_laneq_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_laneq_f32(b, v, lane)) @@ -163,7 +163,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfma_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_laneq_f64(a, b, v, lane) vfma_laneq_f64((a), (b), (v), (lane)) #else #define simde_vfma_laneq_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_laneq_f64(b, v, lane)) @@ -174,7 +174,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmaq_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_lane_f64(a, b, v, lane) vfmaq_lane_f64((a), (b), (v), (lane)) #else #define simde_vfmaq_lane_f64(a, b, v, lane) simde_vaddq_f64(a, simde_vmulq_lane_f64(b, v, lane)) @@ -185,7 +185,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmaq_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_lane_f32(a, b, v, lane) vfmaq_lane_f32((a), (b), (v), (lane)) #else #define simde_vfmaq_lane_f32(a, b, v, lane) simde_vaddq_f32(a, simde_vmulq_lane_f32(b, v, lane)) @@ -196,7 +196,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmaq_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_laneq_f32(a, b, v, lane) vfmaq_laneq_f32((a), (b), (v), (lane)) #else #define simde_vfmaq_laneq_f32(a, b, v, lane) \ @@ -208,7 +208,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmaq_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_laneq_f64(a, b, v, lane) vfmaq_laneq_f64((a), (b), (v), (lane)) #else #define simde_vfmaq_laneq_f64(a, b, v, lane) \ diff --git a/simde/arm/neon/fma_n.h b/simde/arm/neon/fma_n.h index 6cf58259c..d94f01ac3 100644 --- a/simde/arm/neon/fma_n.h +++ b/simde/arm/neon/fma_n.h @@ -38,7 +38,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) return vfma_n_f32(a, b, c); #else return simde_vfma_f32(a, b, simde_vdup_n_f32(c)); @@ -52,7 +52,7 @@ simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vfma_n_f64(a, b, c); #else return simde_vfma_f64(a, b, simde_vdup_n_f64(c)); @@ -66,7 +66,7 @@ simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) return vfmaq_n_f32(a, b, c); #else return simde_vfmaq_f32(a, b, simde_vdupq_n_f32(c)); @@ -80,7 +80,7 @@ simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vfmaq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vfmaq_n_f64(a, b, c); #else return simde_vfmaq_f64(a, b, simde_vdupq_n_f64(c)); diff --git a/simde/mips/msa/madd.h b/simde/mips/msa/madd.h index 15b478eb4..cfa5d8605 100644 --- a/simde/mips/msa/madd.h +++ b/simde/mips/msa/madd.h @@ -38,7 +38,7 @@ simde_v4f32 simde_msa_fmadd_w(simde_v4f32 a, simde_v4f32 b, simde_v4f32 c) { #if defined(SIMDE_MIPS_MSA_NATIVE) return __msa_fmadd_w(a, b, c); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f32(a, c, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_f32(a, b, c); diff --git a/simde/simde-arch.h b/simde/simde-arch.h index 26e796830..1bf978093 100644 --- a/simde/simde-arch.h +++ b/simde/simde-arch.h @@ -124,6 +124,9 @@ #if defined(__ARM_FEATURE_SVE) # define SIMDE_ARCH_ARM_SVE #endif +#if defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA +# define SIMDE_ARCH_ARM_FMA +#endif /* Blackfin */ diff --git a/simde/wasm/relaxed-simd.h b/simde/wasm/relaxed-simd.h index 3bfcc902a..3a2601468 100644 --- a/simde/wasm/relaxed-simd.h +++ b/simde/wasm/relaxed-simd.h @@ -367,7 +367,7 @@ simde_wasm_f32x4_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_madd(c_.altivec_f32, b_.altivec_f32, a_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) r_.neon_f32 = vfmaq_f32(a_.neon_f32, c_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vmlaq_f32(a_.neon_f32, b_.neon_f32, c_.neon_f32); @@ -443,7 +443,7 @@ simde_wasm_f32x4_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_nmsub(c_.altivec_f32, b_.altivec_f32, a_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) r_.neon_f32 = vfmsq_f32(a_.neon_f32, c_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vmlsq_f32(a_.neon_f32, b_.neon_f32, c_.neon_f32); diff --git a/simde/x86/aes.h b/simde/x86/aes.h index e1c6a10f1..0ac12ed39 100644 --- a/simde/x86/aes.h +++ b/simde/x86/aes.h @@ -62,6 +62,7 @@ uint8_t gmult(uint8_t a, uint8_t b) { } */ +#if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) static uint8_t simde_x_aes_gmult_lookup_table[8][256] = { { // gmult(0x02, b); 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, @@ -509,6 +510,7 @@ void simde_x_aes_dec(uint8_t *in, uint8_t *out, uint8_t *w, int is_last) { } } } +#endif // if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_aesenc_si128(simde__m128i a, simde__m128i round_key) { diff --git a/simde/x86/fma.h b/simde/x86/fma.h index 6ed68d5bf..630efc54a 100644 --- a/simde/x86/fma.h +++ b/simde/x86/fma.h @@ -101,7 +101,7 @@ simde_mm_fmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_madd(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) r_.neon_f32 = vfmaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vmlaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); @@ -489,7 +489,7 @@ simde_mm_fnmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { b_ = simde__m128_to_private(b), c_ = simde__m128_to_private(c); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) r_.neon_f32 = vfmsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vmlsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32);