From 6ce6030ae3b8decadac030b88a8a1868e809c9e0 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Thu, 19 Oct 2023 12:19:12 +0200 Subject: [PATCH] sse{,2,4.1}, avx{,2} *_stream_{,load}: use __builtin_nontemporal_{load,store} --- simde/x86/avx.h | 8 +++++++- simde/x86/avx2.h | 2 ++ simde/x86/sse.h | 21 +++++++++++++-------- simde/x86/sse2.h | 30 ++++++++++++++++++++++++++---- simde/x86/sse4.1.h | 9 ++++++--- 5 files changed, 54 insertions(+), 16 deletions(-) diff --git a/simde/x86/avx.h b/simde/x86/avx.h index 06485253d..2314f9556 100644 --- a/simde/x86/avx.h +++ b/simde/x86/avx.h @@ -5345,6 +5345,8 @@ void simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_ps(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); #endif @@ -5359,6 +5361,8 @@ void simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_pd(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); #endif @@ -5373,8 +5377,10 @@ void simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_si256(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); + simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) diff --git a/simde/x86/avx2.h b/simde/x86/avx2.h index 3601e1a33..504c071e9 100644 --- a/simde/x86/avx2.h +++ b/simde/x86/avx2.h @@ -5117,6 +5117,8 @@ simde__m256i simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr)); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + return __builtin_nontemporal_load(mem_addr); #else simde__m256i r; simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); diff --git a/simde/x86/sse.h b/simde/x86/sse.h index e68376621..a3e060a26 100644 --- a/simde/x86/sse.h +++ b/simde/x86/sse.h @@ -4754,16 +4754,19 @@ void simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) || \ + defined(SIMDE_VECTOR_SUBSCRIPT)) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m64_private a_ = simde__m64_to_private(a); + vst1_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), a_.neon_i64); #else simde__m64_private* dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr), a_ = simde__m64_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - dest->i64[0] = vget_lane_s64(a_.neon_i64, 0); - #else - dest->i64[0] = a_.i64[0]; - #endif + dest->i64[0] = a_.i64[0]; #endif } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) @@ -4775,9 +4778,11 @@ void simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) { #if defined(SIMDE_X86_SSE_NATIVE) _mm_stream_ps(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m128_private a_ = simde__m128_to_private(a); - __builtin_nontemporal_store(a_.f32, SIMDE_ALIGN_CAST(__typeof__(a_.f32)*, mem_addr)); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_ASSUME_CAST(__typeof__(a)*, mem_addr)); #else simde_mm_store_ps(mem_addr, a); #endif diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h index 1d2f7da19..96a8163b6 100644 --- a/simde/x86/sse2.h +++ b/simde/x86/sse2.h @@ -3495,13 +3495,13 @@ simde__m128i simde_mm_load_si128 (simde__m128i const* mem_addr) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); #else simde__m128i_private r_; #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)); #else simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i)); #endif @@ -4949,6 +4949,10 @@ simde_mm_loadu_si32 (void const* mem_addr) { return _mm_loadu_si32(mem_addr); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return simde__m128i_from_wasm_v128(wasm_v128_load32_zero(mem_addr)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m128i_private r_; + r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0); + return simde__m128i_from_private(r_); #else int32_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -6600,8 +6604,13 @@ void simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_stream_pd(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(mem_addr, &a, sizeof(a)); + simde_mm_store_pd(mem_addr, a); #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -6613,8 +6622,13 @@ void simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(mem_addr, &a, sizeof(a)); + simde_mm_store_si128(mem_addr, a); #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -6626,6 +6640,10 @@ void simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_stream_si32(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_lane_s32(mem_addr, vdupq_n_s32(a), 0); #else *mem_addr = a; #endif @@ -6639,6 +6657,10 @@ void simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION) _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1_s64(mem_addr, vdup_n_s64(a)); #else *mem_addr = a; #endif diff --git a/simde/x86/sse4.1.h b/simde/x86/sse4.1.h index e0fc485f4..d258f7a95 100644 --- a/simde/x86/sse4.1.h +++ b/simde/x86/sse4.1.h @@ -2139,10 +2139,13 @@ simde__m128i simde_mm_stream_load_si128 (const simde__m128i* mem_addr) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr))); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_load) && ( \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) + return __builtin_nontemporal_load(mem_addr); #else - return *mem_addr; + return simde_mm_load_si128(mem_addr); #endif } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)