From 3afebc4a7a1d2c6ba11a3a1186bf8ebd6ccc3d38 Mon Sep 17 00:00:00 2001
From: yuanhecai
Date: Thu, 12 Dec 2024 13:58:09 +0800
Subject: [PATCH] x86 sse2/avx2: rewrite loongarch immediate-operand shift instructions

x86 intrinsics such as _mm_srli_epi16() accept either a variable or an
immediate shift count as the second parameter, but the corresponding
LoongArch intrinsics such as __lsx_vsrli_h only accept an immediate
operand. Rewrite the LSX/LASX code paths to use the register-operand
shift intrinsics (e.g. __lsx_vsrl_h with __lsx_vreplgr2vr_h) so they
still compile when the count is not a compile-time constant.
---
 simde/x86/avx2.h | 54 +++++++++++++++++++++++++++++++-----------------
 simde/x86/sse2.h | 38 +++++++++++++++++-----------------
 2 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/simde/x86/avx2.h b/simde/x86/avx2.h
index ad939e90a..2d4c40670 100644
--- a/simde/x86/avx2.h
+++ b/simde/x86/avx2.h
@@ -4564,6 +4564,10 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
      the expected range, hence the discrepancy between what we allow and what
      Intel specifies.  Some compilers will return 0, others seem to just mask
      off everything outside of the range. */
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsll_h(a, __lasx_xvreplgr2vr_h(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4586,8 +4590,6 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_slli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvslli_h(a, imm8 & 15))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_slli_epi16(a, imm8) \
     simde_mm256_set_m128i( \
@@ -4603,6 +4605,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsll_w(a, __lasx_xvreplgr2vr_w(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4625,8 +4631,6 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_slli_epi32(a, imm8) (imm8 > 31 ? __lasx_xvreplgr2vr_w(0) : __lasx_xvslli_w(a, imm8 & 31))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_slli_epi32(a, imm8) \
     simde_mm256_set_m128i( \
@@ -4642,6 +4646,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsll_d(a, __lasx_xvreplgr2vr_d(imm8));
+#endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4659,8 +4667,6 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_slli_epi64(a, imm8) (imm8 > 63 ? __lasx_xvreplgr2vr_d(0) : __lasx_xvslli_d(a, imm8))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_slli_epi64(a, imm8) \
     simde_mm256_set_m128i( \
@@ -4927,6 +4933,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsra_h(a, __lasx_xvreplgr2vr_h((imm8 > 15 ? 15 : imm8)));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4947,8 +4957,6 @@ simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srai_epi16(a, imm8) __lasx_xvsrai_h(a, (imm8 > 15 ? 15 : imm8))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srai_epi16(a, imm8) \
     simde_mm256_set_m128i( \
@@ -4964,6 +4972,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsra_w(a, __lasx_xvreplgr2vr_w((imm8 > 31 ? 31 : imm8)));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -4984,8 +4996,6 @@ simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srai_epi32(a, imm8) __lasx_xvsrai_w(a, (imm8 > 31 ? 31 : imm8))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srai_epi32(a, imm8) \
     simde_mm256_set_m128i( \
@@ -5183,13 +5193,17 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  if (imm8 > 15)
+    return simde_mm256_setzero_si256();
+
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsrl_h(a, __lasx_xvreplgr2vr_h(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
 
-  if (imm8 > 15)
-    return simde_mm256_setzero_si256();
-
   #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
     SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
     for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) {
@@ -5214,8 +5228,6 @@ simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvsrli_h(a, imm8 & 15))
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srli_epi16(a, imm8) \
     simde_mm256_set_m128i( \
@@ -5231,6 +5243,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+  #if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsrl_w(a, __lasx_xvreplgr2vr_w(imm8));
+  #endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -5253,8 +5269,6 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srli_epi32(a, imm8) __lasx_xvsrli_w(a, imm8)
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srli_epi32(a, imm8) \
     simde_mm256_set_m128i( \
@@ -5270,6 +5284,10 @@ SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
     SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
+#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
+    return __lasx_xvsrl_d(a, __lasx_xvreplgr2vr_d(imm8));
+#endif
+
   simde__m256i_private
     r_,
     a_ = simde__m256i_to_private(a);
@@ -5287,8 +5305,6 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
 }
 #if defined(SIMDE_X86_AVX2_NATIVE)
 # define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
-#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
-# define simde_mm256_srli_epi64(a, imm8) __lasx_xvsrli_d(a, imm8)
 #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
 # define simde_mm256_srli_epi64(a, imm8) \
     simde_mm256_set_m128i( \
diff --git a/simde/x86/sse2.h b/simde/x86/sse2.h
index ee50bca3b..feffb0244 100644
--- a/simde/x86/sse2.h
+++ b/simde/x86/sse2.h
@@ -6163,7 +6163,7 @@ simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
       r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
     #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-      r_.lsx_i64 = __lsx_vslli_h(a_.lsx_i64, count_.u64[0]);
+      r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(count_.u64[0]));
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
     #else
@@ -6199,7 +6199,7 @@ simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
     #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
      r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
     #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-      r_.lsx_i64 = __lsx_vslli_w(a_.lsx_i64, count_.u64[0]);
+      r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(count_.u64[0]));
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
     #else
@@ -6561,7 +6561,9 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
   #else
     const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
@@ -6589,8 +6591,6 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8)
 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
   #define simde_mm_slli_epi16(a, imm8) \
     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_slli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
@@ -6607,7 +6607,9 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.i32 = a_.i32 << imm8;
   #else
     SIMDE_VECTORIZE
@@ -6646,8 +6648,6 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8)
       } \
       ret; \
     }))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_slli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
@@ -6664,7 +6664,9 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, __lsx_vreplgr2vr_d(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.i64 = a_.i64 << imm8;
   #else
     SIMDE_VECTORIZE
@@ -6688,8 +6690,6 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8)
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
   #define simde_mm_slli_epi64(a, imm8) \
     ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_slli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
@@ -6706,7 +6706,9 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsrl_h(a_.lsx_i64, __lsx_vreplgr2vr_h(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
   #else
     SIMDE_VECTORIZE
@@ -6733,8 +6735,6 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8)
 #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
   #define simde_mm_srli_epi16(a, imm8) \
     ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_srli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
@@ -6751,7 +6751,9 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8)
     r_,
     a_ = simde__m128i_to_private(a);
 
-  #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+  #if defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsrl_w(a_.lsx_i64, __lsx_vreplgr2vr_w(imm8));
+  #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
     r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
   #else
     SIMDE_VECTORIZE
@@ -6790,8 +6792,6 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8)
       } \
      ret; \
     }))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_srli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
@@ -6810,6 +6810,8 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8)
 
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
+  #elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
+    r_.lsx_i64 = __lsx_vsrl_d(a_.lsx_i64, __lsx_vreplgr2vr_d(imm8));
   #else
     #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
       r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
@@ -6836,8 +6838,6 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8)
 #elif defined(SIMDE_WASM_SIMD128_NATIVE)
   #define simde_mm_srli_epi64(a, imm8) \
     ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
-#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
-  #define simde_mm_srli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63))))
 #endif
 #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
   #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
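
Note (illustrative, not part of the patch): the rewrite swaps the immediate-only LSX/LASX shift intrinsics for their register-operand forms by first broadcasting the runtime count into a vector. A minimal sketch, assuming a LoongArch toolchain with LSX enabled; the helper name shift_right_u16 is hypothetical and used only for illustration:

    /* example.c -- build with: cc -mlsx -c example.c */
    #include <lsxintrin.h>

    /* `count` is a runtime value here, so the immediate form
     * __lsx_vsrli_h(v, count) would not compile. Broadcasting the count
     * with __lsx_vreplgr2vr_h and using the register-operand shift
     * __lsx_vsrl_h mirrors what the patch does inside the SIMDe
     * fallback functions, which receive imm8 as an ordinary parameter. */
    static inline __m128i shift_right_u16(__m128i v, int count) {
      return __lsx_vsrl_h(v, __lsx_vreplgr2vr_h(count));
    }

The register-operand shifts do not by themselves reproduce x86's "count too large yields zero" result, which is presumably why simde_mm256_srli_epi16 keeps the explicit `imm8 > 15` early return and the srai paths clamp the count to 15/31 before broadcasting.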