From 6346405cfdce63bf882a7801c9cedcefd28ba8fd Mon Sep 17 00:00:00 2001 From: Ruhung <143302514+Ruhung@users.noreply.github.com> Date: Fri, 21 Jun 2024 08:31:00 -0400 Subject: [PATCH] arm/neon riscv64: additional RVV implementations - part1 (#1188) Contains RVV implementations for the following Neon instructions. `abs`, `addl`, `addl_high`, `addlv`, `addv`, `cge`, `cgt`, `cle`, `clez`, `clt`, `cnt`, `fma`, `fms`, `fms_n`, `get_high`, `get_low`, `hsub`, `mla`, `mla_n`, `mlal`, `mlal_high`, `mlal_high_n`, `mlal_n`, `mls`, `mls_n`, `mlsl`, `mlsl_high`, `mlsl_high_n`, `mlsl_n`, `qsub`, `qtbl`, `qtbx`, `rbit`, `recpe`, `rev16`, `rev32`, `rev64`, `subl`, `subl_high`, `subw`, `subw_high`, `tbl`, `tbx` --- simde/arm/neon/abs.h | 72 ++++++--- simde/arm/neon/addl.h | 43 ++++++ simde/arm/neon/addl_high.h | 55 +++++++ simde/arm/neon/addlv.h | 185 +++++++++++++++++------- simde/arm/neon/addv.h | 273 ++++++++++++++++++++++++----------- simde/arm/neon/cge.h | 118 +++++++++++++-- simde/arm/neon/cgt.h | 117 +++++++++++++-- simde/arm/neon/cle.h | 117 +++++++++++++-- simde/arm/neon/clez.h | 125 ++++++++++++---- simde/arm/neon/clt.h | 117 +++++++++++++-- simde/arm/neon/cnt.h | 33 ++++- simde/arm/neon/fma.h | 45 +++++- simde/arm/neon/fms.h | 49 +++++++ simde/arm/neon/fms_n.h | 43 ++++++ simde/arm/neon/get_high.h | 55 +++++-- simde/arm/neon/get_low.h | 37 ++++- simde/arm/neon/hsub.h | 61 ++++++++ simde/arm/neon/mla.h | 135 ++++++++++++++++- simde/arm/neon/mla_n.h | 49 +++++-- simde/arm/neon/mlal.h | 55 +++++++ simde/arm/neon/mlal_high.h | 55 +++++++ simde/arm/neon/mlal_high_n.h | 29 ++++ simde/arm/neon/mlal_n.h | 30 +++- simde/arm/neon/mls.h | 133 ++++++++++++++++- simde/arm/neon/mls_n.h | 71 +++++++++ simde/arm/neon/mlsl.h | 55 +++++++ simde/arm/neon/mlsl_high.h | 67 +++++++++ simde/arm/neon/mlsl_high_n.h | 33 +++++ simde/arm/neon/mlsl_n.h | 29 ++++ simde/arm/neon/qsub.h | 45 +++++- simde/arm/neon/qtbl.h | 69 +++++++++ simde/arm/neon/qtbx.h | 75 ++++++++++ simde/arm/neon/rbit.h | 15 ++ simde/arm/neon/recpe.h | 40 +++-- simde/arm/neon/rev16.h | 7 + simde/arm/neon/rev32.h | 15 +- simde/arm/neon/rev64.h | 19 +++ simde/arm/neon/subl.h | 37 +++++ simde/arm/neon/subl_high.h | 49 +++++++ simde/arm/neon/subw.h | 37 +++-- simde/arm/neon/subw_high.h | 44 ++++-- simde/arm/neon/tbl.h | 32 ++++ simde/arm/neon/tbx.h | 34 +++++ 43 files changed, 2474 insertions(+), 330 deletions(-) diff --git a/simde/arm/neon/abs.h b/simde/arm/neon/abs.h index 16250da78..3cc11d4d7 100644 --- a/simde/arm/neon/abs.h +++ b/simde/arm/neon/abs.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ABS_H) @@ -74,10 +75,14 @@ simde_vabs_f16(simde_float16x4_t a) { r_, a_ = simde_float16x4_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vabsh_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfabs_v_f16m1(a_.sv64 , 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vabsh_f16(a_.values[i]); + } + #endif return simde_float16x4_from_private(r_); #endif @@ -97,10 +102,14 @@ simde_vabs_f32(simde_float32x2_t a) { r_, a_ = simde_float32x2_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? 
-a_.values[i] : a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfabs_v_f32m1(a_.sv64 , 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; + } + #endif return simde_float32x2_from_private(r_); #endif @@ -120,10 +129,14 @@ simde_vabs_f64(simde_float64x1_t a) { r_, a_ = simde_float64x1_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfabs_v_f64m1(a_.sv64 , 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; + } + #endif return simde_float64x1_from_private(r_); #endif @@ -145,6 +158,8 @@ simde_vabs_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_abs_pi8(a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64 , __riscv_vneg_v_i8m1(a_.sv64 , 8) , 8); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -175,6 +190,8 @@ simde_vabs_s16(simde_int16x4_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_abs_pi16(a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64 , __riscv_vneg_v_i16m1(a_.sv64 , 4) , 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -205,6 +222,8 @@ simde_vabs_s32(simde_int32x2_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_abs_pi32(a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64 , __riscv_vneg_v_i32m1(a_.sv64 , 2) , 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -233,7 +252,9 @@ simde_vabs_s64(simde_int64x1_t a) { r_, a_ = simde_int64x1_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i64m1(a_.sv64 , __riscv_vneg_v_i64m1(a_.sv64 , 1) , 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); #else @@ -261,10 +282,14 @@ simde_vabsq_f16(simde_float16x8_t a) { r_, a_ = simde_float16x8_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vabsh_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfabs_v_f16m1(a_.sv128 , 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vabsh_f16(a_.values[i]); + } + #endif return simde_float16x8_from_private(r_); #endif @@ -288,6 +313,8 @@ 
simde_vabsq_f32(simde_float32x4_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfabs_v_f32m1(a_.sv128 , 4); #elif defined(SIMDE_X86_SSE_NATIVE) simde_float32 mask_; uint32_t u32_ = UINT32_C(0x7FFFFFFF); @@ -325,6 +352,8 @@ simde_vabsq_f64(simde_float64x2_t a) { uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF); simde_memcpy(&mask_, &u64_, sizeof(u64_)); r_.m128d = _mm_and_pd(_mm_set1_pd(mask_), a_.m128d); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfabs_v_f64m1(a_.sv128 , 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -358,6 +387,8 @@ simde_vabsq_s8(simde_int8x16_t a) { r_.m128i = _mm_min_epu8(a_.m128i, _mm_sub_epi8(_mm_setzero_si128(), a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128 , __riscv_vneg_v_i8m1(a_.sv128 , 16) , 16); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -394,6 +425,8 @@ simde_vabsq_s16(simde_int16x8_t a) { r_.m128i = _mm_max_epi16(a_.m128i, _mm_sub_epi16(_mm_setzero_si128(), a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128 , __riscv_vneg_v_i16m1(a_.sv128 , 8) , 8); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -431,6 +464,8 @@ simde_vabsq_s32(simde_int32x4_t a) { r_.m128i = _mm_sub_epi32(_mm_xor_si128(a_.m128i, m), m); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128 , __riscv_vneg_v_i32m1(a_.sv128 , 4) , 4); #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); @@ -452,6 +487,7 @@ simde_vabsq_s32(simde_int32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vabsq_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vabsq_s64(a); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -470,6 +506,8 @@ simde_vabsq_s64(simde_int64x2_t a) { r_.m128i = _mm_sub_epi64(_mm_xor_si128(a_.m128i, m), m); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_abs(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmax_vv_i64m1(a_.sv128 , __riscv_vneg_v_i64m1(a_.sv128 , 2) , 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0)); r_.values = (-a_.values & m) | (a_.values & ~m); diff --git a/simde/arm/neon/addl.h b/simde/arm/neon/addl.h index 539e91e47..cdabc5802 100644 --- a/simde/arm/neon/addl.h +++ b/simde/arm/neon/addl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDL_H) @@ -42,6 +43,13 @@ simde_int16x8_t simde_vaddl_s8(simde_int8x8_t a, simde_int8x8_t b) { #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + r_.sv128 = __riscv_vwadd_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv64) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , 8); + return simde_int16x8_from_private(r_); #else return simde_vaddq_s16(simde_vmovl_s8(a), simde_vmovl_s8(b)); #endif @@ -56,6 +64,13 @@ simde_int32x4_t simde_vaddl_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + + r_.sv128 = __riscv_vwadd_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , 4); + return simde_int32x4_from_private(r_); #else return simde_vaddq_s32(simde_vmovl_s16(a), simde_vmovl_s16(b)); #endif @@ -70,6 +85,13 @@ simde_int64x2_t simde_vaddl_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + + r_.sv128 = __riscv_vwadd_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , 2); + return simde_int64x2_from_private(r_); #else return simde_vaddq_s64(simde_vmovl_s32(a), simde_vmovl_s32(b)); #endif @@ -84,6 +106,13 @@ simde_uint16x8_t simde_vaddl_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + + r_.sv128 = __riscv_vwaddu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vaddq_u16(simde_vmovl_u8(a), simde_vmovl_u8(b)); #endif @@ -98,6 +127,13 @@ simde_uint32x4_t simde_vaddl_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + + r_.sv128 = __riscv_vwaddu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vaddq_u32(simde_vmovl_u16(a), simde_vmovl_u16(b)); #endif @@ -112,6 +148,13 @@ simde_uint64x2_t simde_vaddl_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddl_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + + r_.sv128 = __riscv_vwaddu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vaddq_u64(simde_vmovl_u32(a), simde_vmovl_u32(b)); #endif diff --git a/simde/arm/neon/addl_high.h b/simde/arm/neon/addl_high.h index
fdef796c9..cf229823e 100644 --- a/simde/arm/neon/addl_high.h +++ b/simde/arm/neon/addl_high.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDL_HIGH_H) @@ -42,6 +43,15 @@ simde_int16x8_t simde_vaddl_high_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16); + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwadd_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv128) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv128) , 8); + return simde_int16x8_from_private(r_); #else return simde_vaddq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b)); #endif @@ -56,6 +66,15 @@ simde_int32x4_t simde_vaddl_high_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwadd_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv128) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv128) , 4); + return simde_int32x4_from_private(r_); #else return simde_vaddq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b)); #endif @@ -70,6 +89,15 @@ simde_int64x2_t simde_vaddl_high_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_i32m1(a_.sv128 , 2, 4); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwadd_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv128) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv128) , 2); + return simde_int64x2_from_private(r_); #else return simde_vaddq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b)); #endif @@ -84,6 +112,15 @@ simde_uint16x8_t simde_vaddl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16); + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwaddu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vaddq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b)); #endif @@ -98,6 +135,15 @@ simde_uint32x4_t simde_vaddl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint16x8_private b_ = 
simde_uint16x8_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwaddu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vaddq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b)); #endif @@ -112,6 +158,15 @@ simde_uint64x2_t simde_vaddl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddl_high_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + + a_.sv128 = __riscv_vslidedown_vx_u32m1(a_.sv128 , 2, 4); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwaddu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vaddq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b)); #endif diff --git a/simde/arm/neon/addlv.h b/simde/arm/neon/addlv.h index dc7de0c45..37be2c82e 100644 --- a/simde/arm/neon/addlv.h +++ b/simde/arm/neon/addlv.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDLV_H) @@ -40,16 +41,22 @@ int16_t simde_vaddlv_s8(simde_int8x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_s8(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_s16(simde_vmovl_s8(a)); #else simde_int8x8_private a_ = simde_int8x8_to_private(a); int16_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t zero = __riscv_vmv_v_x_i16m1(0 , 1); + vint16m1_t sum = __riscv_vwredsum_vs_i8m1_i16m1(a_.sv64 , zero , 8); + r = __riscv_vmv_x_s_i16m1_i16 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -64,16 +71,22 @@ int32_t simde_vaddlv_s16(simde_int16x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_s16(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_s32(simde_vmovl_s16(a)); #else simde_int16x4_private a_ = simde_int16x4_to_private(a); int32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0 , 1); + vint32m1_t sum = __riscv_vwredsum_vs_i16m1_i32m1(a_.sv64 , zero , 4); + r = __riscv_vmv_x_s_i32m1_i32 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -88,16 +101,22 @@ int64_t simde_vaddlv_s32(simde_int32x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_s32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_s64(simde_vmovl_s32(a)); #else simde_int32x2_private a_ = simde_int32x2_to_private(a); 
int64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0 , 1); + vint64m1_t sum = __riscv_vwredsum_vs_i32m1_i64m1(a_.sv64 , zero , 2); + r = __riscv_vmv_x_s_i64m1_i64 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -112,16 +131,22 @@ uint16_t simde_vaddlv_u8(simde_uint8x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_u8(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_u16(simde_vmovl_u8(a)); #else simde_uint8x8_private a_ = simde_uint8x8_to_private(a); uint16_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0 , 1); + vuint16m1_t sum = __riscv_vwredsumu_vs_u8m1_u16m1(a_.sv64 , zero , 8); + r = __riscv_vmv_x_s_u16m1_u16 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -136,16 +161,22 @@ uint32_t simde_vaddlv_u16(simde_uint16x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_u16(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_u32(simde_vmovl_u16(a)); #else simde_uint16x4_private a_ = simde_uint16x4_to_private(a); uint32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0 , 1); + vuint32m1_t sum = __riscv_vwredsumu_vs_u16m1_u32m1(a_.sv64 , zero , 4); + r = __riscv_vmv_x_s_u32m1_u32 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -160,16 +191,22 @@ uint64_t simde_vaddlv_u32(simde_uint32x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlv_u32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddvq_u64(simde_vmovl_u32(a)); #else simde_uint32x2_private a_ = simde_uint32x2_to_private(a); uint64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0 , 1); + vuint64m1_t sum = __riscv_vwredsumu_vs_u32m1_u64m1(a_.sv64 , zero , 2); + r = __riscv_vmv_x_s_u64m1_u64 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -194,10 +231,16 @@ simde_vaddlvq_s8(simde_int8x16_t a) { simde_int8x16_private a_ = simde_int8x16_to_private(a); int16_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t zero = __riscv_vmv_v_x_i16m1(0 , 1); + vint16m1_t sum = 
__riscv_vwredsum_vs_i8m1_i16m1(a_.sv128 , zero , 16); + r = __riscv_vmv_x_s_i16m1_i16 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -223,10 +266,16 @@ simde_vaddlvq_s16(simde_int16x8_t a) { simde_int16x8_private a_ = simde_int16x8_to_private(a); int32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0 , 1); + vint32m1_t sum = __riscv_vwredsum_vs_i16m1_i32m1(a_.sv128 , zero , 8); + r = __riscv_vmv_x_s_i32m1_i32 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -245,10 +294,16 @@ simde_vaddlvq_s32(simde_int32x4_t a) { simde_int32x4_private a_ = simde_int32x4_to_private(a); int64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0 , 1); + vint64m1_t sum = __riscv_vwredsum_vs_i32m1_i64m1(a_.sv128 , zero , 4); + r = __riscv_vmv_x_s_i64m1_i64 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -272,10 +327,16 @@ simde_vaddlvq_u8(simde_uint8x16_t a) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a); uint16_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0 , 1); + vuint16m1_t sum = __riscv_vwredsumu_vs_u8m1_u16m1(a_.sv128 , zero , 16); + r = __riscv_vmv_x_s_u16m1_u16 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; #endif @@ -298,14 +359,20 @@ simde_vaddlvq_u16(simde_uint16x8_t a) { return HEDLEY_STATIC_CAST(uint32_t, _mm_cvtsi128_si32(a_)); #else simde_uint16x8_private a_ = simde_uint16x8_to_private(a); - uint32_t r = 0; - - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + uint32_t r = 0; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0 , 1); + vuint32m1_t sum = __riscv_vwredsumu_vs_u16m1_u32m1(a_.sv128 , zero , 8); + r = __riscv_vmv_x_s_u32m1_u32 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif return r; + #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -322,10 +389,16 @@ simde_vaddlvq_u32(simde_uint32x4_t a) { simde_uint32x4_private a_ = simde_uint32x4_to_private(a); uint64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0 , 1); + vuint64m1_t sum = __riscv_vwredsumu_vs_u32m1_u64m1(a_.sv128 , zero , 4); + r = __riscv_vmv_x_s_u64m1_u64 (sum); + #else + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; 
i++) { + r += a_.values[i]; + } + #endif return r; #endif diff --git a/simde/arm/neon/addv.h b/simde/arm/neon/addv.h index 6beb9836c..00ed4c8e1 100644 --- a/simde/arm/neon/addv.h +++ b/simde/arm/neon/addv.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDV_H) @@ -43,11 +44,17 @@ simde_vaddv_f32(simde_float32x2_t a) { #else simde_float32x2_private a_ = simde_float32x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0 , 1); + vfloat32m1_t sum = __riscv_vfredosum_vs_f32m1_f32m1(a_.sv64 , zero , 2); + r = __riscv_vfmv_f_s_f32m1_f32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -67,11 +74,17 @@ simde_vaddv_s8(simde_int8x8_t a) { #else simde_int8x8_private a_ = simde_int8x8_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t zero = __riscv_vmv_v_x_i8m1(0 , 1); + vint8m1_t sum = __riscv_vredsum_vs_i8m1_i8m1(a_.sv64 , zero , 8); + r = __riscv_vmv_x_s_i8m1_i8 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -91,11 +104,17 @@ simde_vaddv_s16(simde_int16x4_t a) { #else simde_int16x4_private a_ = simde_int16x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t zero = __riscv_vmv_v_x_i16m1(0 , 1); + vint16m1_t sum = __riscv_vredsum_vs_i16m1_i16m1(a_.sv64 , zero , 4); + r = __riscv_vmv_x_s_i16m1_i16 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -115,11 +134,17 @@ simde_vaddv_s32(simde_int32x2_t a) { #else simde_int32x2_private a_ = simde_int32x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0 , 1); + vint32m1_t sum = __riscv_vredsum_vs_i32m1_i32m1(a_.sv64 , zero , 2); + r = __riscv_vmv_x_s_i32m1_i32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -139,11 +164,17 @@ simde_vaddv_u8(simde_uint8x8_t a) { #else simde_uint8x8_private a_ = simde_uint8x8_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t zero = __riscv_vmv_v_x_u8m1(0 , 1); + vuint8m1_t sum = __riscv_vredsum_vs_u8m1_u8m1(a_.sv64 , zero , 8); + r = __riscv_vmv_x_s_u8m1_u8 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ 
-163,11 +194,17 @@ simde_vaddv_u16(simde_uint16x4_t a) { #else simde_uint16x4_private a_ = simde_uint16x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0 , 1); + vuint16m1_t sum = __riscv_vredsum_vs_u16m1_u16m1(a_.sv64 , zero , 4); + r = __riscv_vmv_x_s_u16m1_u16(sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -187,11 +224,17 @@ simde_vaddv_u32(simde_uint32x2_t a) { #else simde_uint32x2_private a_ = simde_uint32x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0 , 1); + vuint32m1_t sum = __riscv_vredsum_vs_u32m1_u32m1(a_.sv64 , zero , 2); + r = __riscv_vmv_x_s_u32m1_u32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -211,11 +254,17 @@ simde_vaddvq_f32(simde_float32x4_t a) { #else simde_float32x4_private a_ = simde_float32x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0 , 1); + vfloat32m1_t sum = __riscv_vfredosum_vs_f32m1_f32m1(a_.sv128 , zero , 4); + r = __riscv_vfmv_f_s_f32m1_f32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -235,11 +284,17 @@ simde_vaddvq_f64(simde_float64x2_t a) { #else simde_float64x2_private a_ = simde_float64x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1_t zero = __riscv_vfmv_v_f_f64m1(0 , 1); + vfloat64m1_t sum = __riscv_vfredosum_vs_f64m1_f64m1(a_.sv128 , zero , 2); + r = __riscv_vfmv_f_s_f64m1_f64 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -259,11 +314,17 @@ simde_vaddvq_s8(simde_int8x16_t a) { #else simde_int8x16_private a_ = simde_int8x16_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t zero = __riscv_vmv_v_x_i8m1(0 , 1); + vint8m1_t sum = __riscv_vredsum_vs_i8m1_i8m1(a_.sv128 , zero , 16); + r = __riscv_vmv_x_s_i8m1_i8 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -283,11 +344,17 @@ simde_vaddvq_s16(simde_int16x8_t a) { #else simde_int16x8_private a_ = simde_int16x8_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t 
zero = __riscv_vmv_v_x_i16m1(0 , 1); + vint16m1_t sum = __riscv_vredsum_vs_i16m1_i16m1(a_.sv128 , zero , 8); + r = __riscv_vmv_x_s_i16m1_i16 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -307,11 +374,17 @@ simde_vaddvq_s32(simde_int32x4_t a) { #else simde_int32x4_private a_ = simde_int32x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0 , 1); + vint32m1_t sum = __riscv_vredsum_vs_i32m1_i32m1(a_.sv128 , zero , 4); + r = __riscv_vmv_x_s_i32m1_i32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -331,11 +404,17 @@ simde_vaddvq_s64(simde_int64x2_t a) { #else simde_int64x2_private a_ = simde_int64x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0 , 1); + vint64m1_t sum = __riscv_vredsum_vs_i64m1_i64m1(a_.sv128 , zero , 2); + r = __riscv_vmv_x_s_i64m1_i64 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -360,11 +439,17 @@ simde_vaddvq_u8(simde_uint8x16_t a) { #else simde_uint8x16_private a_ = simde_uint8x16_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t zero = __riscv_vmv_v_x_u8m1(0 , 1); + vuint8m1_t sum = __riscv_vredsum_vs_u8m1_u8m1(a_.sv128 , zero , 16); + r = __riscv_vmv_x_s_u8m1_u8 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -384,11 +469,17 @@ simde_vaddvq_u16(simde_uint16x8_t a) { #else simde_uint16x8_private a_ = simde_uint16x8_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t zero = __riscv_vmv_v_x_u16m1(0 , 1); + vuint16m1_t sum = __riscv_vredsum_vs_u16m1_u16m1(a_.sv128 , zero , 8); + r = __riscv_vmv_x_s_u16m1_u16(sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ -408,11 +499,17 @@ simde_vaddvq_u32(simde_uint32x4_t a) { #else simde_uint32x4_private a_ = simde_uint32x4_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0 , 1); + vuint32m1_t sum = __riscv_vredsum_vs_u32m1_u32m1(a_.sv128 , zero , 4); + r = __riscv_vmv_x_s_u32m1_u32 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; @@ 
-432,11 +529,17 @@ simde_vaddvq_u64(simde_uint64x2_t a) { #else simde_uint64x2_private a_ = simde_uint64x2_to_private(a); - r = 0; - SIMDE_VECTORIZE_REDUCTION(+:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r += a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0 , 1); + vuint64m1_t sum = __riscv_vredsum_vs_u64m1_u64m1(a_.sv128 , zero , 2); + r = __riscv_vmv_x_s_u64m1_u64 (sum); + #else + r = 0; + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r += a_.values[i]; + } + #endif #endif return r; diff --git a/simde/arm/neon/cge.h b/simde/arm/neon/cge.h index 2ed6655a4..ec2406dfd 100644 --- a/simde/arm/neon/cge.h +++ b/simde/arm/neon/cge.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CGE_H) @@ -59,10 +60,16 @@ simde_vcgeq_f16(simde_float16x8_t a, simde_float16x8_t b) { b_ = simde_float16x8_to_private(b); simde_uint16x8_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vcgeh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfge_vv_f16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgeh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x8_from_private(r_); #endif @@ -85,10 +92,15 @@ simde_vcgeq_f32(simde_float32x4_t a, simde_float32x4_t b) { b_ = simde_float32x4_to_private(b); simde_uint32x4_private r_; + #if defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_castps_si128(_mm_cmpge_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfge_vv_f32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -123,6 +135,10 @@ simde_vcgeq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128i = _mm_castpd_si128(_mm_cmpge_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfge_vv_f64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -157,6 +173,10 @@ simde_vcgeq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi8(a_.m128i, b_.m128i), _mm_cmpeq_epi8(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsge_vv_i8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -191,6 +211,10 @@ simde_vcgeq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi16(a_.m128i, b_.m128i), _mm_cmpeq_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsge_vv_i16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -225,6 +249,10 @@ simde_vcgeq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi32(a_.m128i, b_.m128i), _mm_cmpeq_epi32(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsge_vv_i32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -259,6 +287,10 @@ simde_vcgeq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) r_.m128i = _mm_or_si128(_mm_cmpgt_epi64(a_.m128i, b_.m128i), _mm_cmpeq_epi64(a_.m128i, b_.m128i)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsge_vv_i64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -297,6 +329,10 @@ simde_vcgeq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgeu_vv_u8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -338,6 +374,10 @@ simde_vcgeq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi16(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgeu_vv_u16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -379,6 +419,10 @@ simde_vcgeq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi32(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi32(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_ge(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgeu_vv_u32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, 
result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -418,6 +462,10 @@ simde_vcgeq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #elif defined(SIMDE_X86_SSE4_2_NATIVE) __m128i sign_bits = _mm_set1_epi64x(INT64_MIN); r_.m128i = _mm_or_si128(_mm_cmpgt_epi64(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi64(a_.m128i, b_.m128i)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgeu_vv_u64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -446,10 +494,16 @@ simde_vcge_f16(simde_float16x4_t a, simde_float16x4_t b) { b_ = simde_float16x4_to_private(b); simde_uint16x4_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vcgeh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfge_vv_f16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, 0xffff, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgeh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x4_from_private(r_); #endif @@ -470,7 +524,11 @@ simde_vcge_f32(simde_float32x2_t a, simde_float32x2_t b) { b_ = simde_float32x2_to_private(b); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfge_vv_f32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else SIMDE_VECTORIZE @@ -498,7 +556,11 @@ simde_vcge_f64(simde_float64x1_t a, simde_float64x1_t b) { b_ = simde_float64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfge_vv_f64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else SIMDE_VECTORIZE @@ -528,6 +590,10 @@ simde_vcge_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_or_si64(_mm_cmpgt_pi8(a_.m64, b_.m64), _mm_cmpeq_pi8(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsge_vv_i8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -560,6 +626,10 @@ simde_vcge_s16(simde_int16x4_t a, simde_int16x4_t b) { r_.m64 = _mm_or_si64(_mm_cmpgt_pi16(a_.m64, b_.m64), _mm_cmpeq_pi16(a_.m64, b_.m64)); #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsge_vv_i16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -590,6 +660,10 @@ simde_vcge_s32(simde_int32x2_t a, simde_int32x2_t b) { r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(a_.m64, b_.m64), _mm_cmpeq_pi32(a_.m64, b_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsge_vv_i32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -616,7 +690,11 @@ simde_vcge_s64(simde_int64x1_t a, simde_int64x1_t b) { b_ = simde_int64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsge_vv_i64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else SIMDE_VECTORIZE @@ -647,6 +725,10 @@ simde_vcge_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi8(INT8_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi8(_mm_xor_si64(a_.m64, sign_bits), _mm_xor_si64(b_.m64, sign_bits)), _mm_cmpeq_pi8(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgeu_vv_u8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -678,6 +760,10 @@ simde_vcge_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi16(INT16_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi16(_mm_xor_si64(a_.m64, sign_bits), _mm_xor_si64(b_.m64, sign_bits)), _mm_cmpeq_pi16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgeu_vv_u16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -709,6 +795,10 @@ simde_vcge_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi32(INT32_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(_mm_xor_si64(a_.m64, sign_bits), _mm_xor_si64(b_.m64, sign_bits)), _mm_cmpeq_pi32(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgeu_vv_u32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else @@ -737,7 +827,11 @@ simde_vcge_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgeu_vv_u64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/cgt.h b/simde/arm/neon/cgt.h index 465cdb917..f3023cbb1 100644 --- a/simde/arm/neon/cgt.h +++ b/simde/arm/neon/cgt.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CGT_H) @@ -121,10 +122,16 @@ simde_vcgtq_f16(simde_float16x8_t a, simde_float16x8_t b) { b_ = simde_float16x8_to_private(b); simde_uint16x8_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfgt_vv_f16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x8_from_private(r_); #endif @@ -151,6 +158,10 @@ simde_vcgtq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128i = _mm_castps_si128(_mm_cmpgt_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfgt_vv_f32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -185,6 +196,10 @@ simde_vcgtq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128i = _mm_castpd_si128(_mm_cmpgt_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfgt_vv_f64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -219,6 +234,10 @@ simde_vcgtq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_cmpgt_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgt_vv_i8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -253,6 +272,10 @@ simde_vcgtq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = 
_mm_cmpgt_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgt_vv_i16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -287,6 +310,10 @@ simde_vcgtq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_cmpgt_epi32(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgt_vv_i32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -326,6 +353,10 @@ simde_vcgtq_s64(simde_int64x2_t a, simde_int64x2_t b) { __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a_.m128i, b_.m128i), _mm_sub_epi64(b_.m128i, a_.m128i)); r = _mm_or_si128(r, _mm_cmpgt_epi32(a_.m128i, b_.m128i)); r_.m128i = _mm_shuffle_epi32(r, _MM_SHUFFLE(3,3,1,1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgt_vv_i64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -361,6 +392,10 @@ simde_vcgtq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_adds_epu8(tmp, _mm_sub_epi8(_mm_setzero_si128(), tmp)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgtu_vv_u8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -396,6 +431,10 @@ simde_vcgtq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_adds_epu16(tmp, _mm_sub_epi16(_mm_setzero_si128(), tmp)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgtu_vv_u16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -434,6 +473,10 @@ simde_vcgtq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_gt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgtu_vv_u32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -467,6 +510,10 @@ simde_vcgtq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) __m128i sign_bit = _mm_set1_epi64x(INT64_MIN); r_.m128i = _mm_cmpgt_epi64(_mm_xor_si128(a_.m128i, 
sign_bit), _mm_xor_si128(b_.m128i, sign_bit)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgtu_vv_u64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -495,10 +542,16 @@ simde_vcgt_f16(simde_float16x4_t a, simde_float16x4_t b) { b_ = simde_float16x4_to_private(b); simde_uint16x4_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfgt_vv_f16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x4_from_private(r_); #endif @@ -519,7 +572,11 @@ simde_vcgt_f32(simde_float32x2_t a, simde_float32x2_t b) { b_ = simde_float32x2_to_private(b); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfgt_vv_f32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else SIMDE_VECTORIZE @@ -547,7 +604,11 @@ simde_vcgt_f64(simde_float64x1_t a, simde_float64x1_t b) { b_ = simde_float64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfgt_vv_f64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else SIMDE_VECTORIZE @@ -577,6 +638,10 @@ simde_vcgt_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi8(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgt_vv_i8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -607,6 +672,10 @@ simde_vcgt_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi16(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgt_vv_i16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -637,6 +706,10 @@ simde_vcgt_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi32(a_.m64, b_.m64); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgt_vv_i32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -665,7 +738,11 @@ simde_vcgt_s64(simde_int64x1_t a, simde_int64x1_t b) { b_ = simde_int64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgt_vv_i64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else SIMDE_VECTORIZE @@ -696,6 +773,10 @@ simde_vcgt_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bit = _mm_set1_pi8(INT8_MIN); r_.m64 = _mm_cmpgt_pi8(_mm_xor_si64(a_.m64, sign_bit), _mm_xor_si64(b_.m64, sign_bit)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsgtu_vv_u8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -727,6 +808,10 @@ simde_vcgt_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bit = _mm_set1_pi16(INT16_MIN); r_.m64 = _mm_cmpgt_pi16(_mm_xor_si64(a_.m64, sign_bit), _mm_xor_si64(b_.m64, sign_bit)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsgtu_vv_u16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -758,6 +843,10 @@ simde_vcgt_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bit = _mm_set1_pi32(INT32_MIN); r_.m64 = _mm_cmpgt_pi32(_mm_xor_si64(a_.m64, sign_bit), _mm_xor_si64(b_.m64, sign_bit)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsgtu_vv_u32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else @@ -786,7 +875,11 @@ simde_vcgt_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsgtu_vv_u64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values > b_.values); #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/cle.h b/simde/arm/neon/cle.h index fedfcc522..b71f5c936 100644 --- a/simde/arm/neon/cle.h +++ b/simde/arm/neon/cle.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by 
Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CLE_H) @@ -116,10 +117,16 @@ simde_vcleq_f16(simde_float16x8_t a, simde_float16x8_t b) { b_ = simde_float16x8_to_private(b); simde_uint16x8_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfle_vv_f16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x8_from_private(r_); #endif @@ -146,6 +153,10 @@ simde_vcleq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128i = _mm_castps_si128(_mm_cmple_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfle_vv_f32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -180,6 +191,10 @@ simde_vcleq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128i = _mm_castpd_si128(_mm_cmple_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfle_vv_f64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -214,6 +229,10 @@ simde_vcleq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi8(b_.m128i, a_.m128i), _mm_cmpeq_epi8(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsle_vv_i8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -248,6 +267,10 @@ simde_vcleq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi16(b_.m128i, a_.m128i), _mm_cmpeq_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsle_vv_i16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -282,6 +305,10 @@ simde_vcleq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_or_si128(_mm_cmpgt_epi32(b_.m128i, a_.m128i), _mm_cmpeq_epi32(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + 
vbool32_t result = __riscv_vmsle_vv_i32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -316,6 +343,10 @@ simde_vcleq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) r_.m128i = _mm_or_si128(_mm_cmpgt_epi64(b_.m128i, a_.m128i), _mm_cmpeq_epi64(a_.m128i, b_.m128i)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsle_vv_i64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -355,6 +386,10 @@ simde_vcleq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsleu_vv_u8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -403,6 +438,10 @@ simde_vcleq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsleu_vv_u16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -451,6 +490,10 @@ simde_vcleq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_le(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsleu_vv_u32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -497,6 +540,10 @@ simde_vcleq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { ), _mm_cmpeq_epi64(a_.m128i, b_.m128i) ); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsleu_vv_u64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -525,10 +572,16 @@ simde_vcle_f16(simde_float16x4_t a, simde_float16x4_t b) { b_ = simde_float16x4_to_private(b); simde_uint16x4_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfle_vv_f16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = 
simde_vcleh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x4_from_private(r_); #endif @@ -549,7 +602,11 @@ simde_vcle_f32(simde_float32x2_t a, simde_float32x2_t b) { b_ = simde_float32x2_to_private(b); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfle_vv_f32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else SIMDE_VECTORIZE @@ -577,7 +634,11 @@ simde_vcle_f64(simde_float64x1_t a, simde_float64x1_t b) { b_ = simde_float64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfle_vv_f64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else SIMDE_VECTORIZE @@ -607,6 +668,10 @@ simde_vcle_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_or_si64(_mm_cmpgt_pi8(b_.m64, a_.m64), _mm_cmpeq_pi8(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsle_vv_i8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -637,6 +702,10 @@ simde_vcle_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_or_si64(_mm_cmpgt_pi16(b_.m64, a_.m64), _mm_cmpeq_pi16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsle_vv_i16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -667,6 +736,10 @@ simde_vcle_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(b_.m64, a_.m64), _mm_cmpeq_pi32(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsle_vv_i32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -695,7 +768,11 @@ simde_vcle_s64(simde_int64x1_t a, simde_int64x1_t b) { b_ = simde_int64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsle_vv_i64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else SIMDE_VECTORIZE @@ -726,6 +803,10 @@ 
simde_vcle_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi8(INT8_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi8(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)), _mm_cmpeq_pi8(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsleu_vv_u8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -757,6 +838,10 @@ simde_vcle_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi16(INT16_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi16(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)), _mm_cmpeq_pi16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsleu_vv_u16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -788,6 +873,10 @@ simde_vcle_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi32(INT32_MIN); r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)), _mm_cmpeq_pi32(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsleu_vv_u32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else @@ -816,7 +905,11 @@ simde_vcle_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsleu_vv_u64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= b_.values); #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/clez.h b/simde/arm/neon/clez.h index dd308c7f4..b8f1b5f8e 100644 --- a/simde/arm/neon/clez.h +++ b/simde/arm/neon/clez.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CLEZ_H) @@ -104,10 +105,16 @@ simde_vclezq_f16(simde_float16x8_t a) { simde_float16x8_private a_ = simde_float16x8_to_private(a); simde_uint16x8_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vclezh_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfle_vf_f16m1_b16(a_.sv128, 0, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = 
simde_vclezh_f16(a_.values[i]); + } + #endif return simde_uint16x8_from_private(r_); #endif @@ -122,13 +129,17 @@ simde_uint32x4_t simde_vclezq_f32(simde_float32x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_f32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_f32(a, simde_vdupq_n_f32(SIMDE_FLOAT32_C(0.0))); #else simde_float32x4_private a_ = simde_float32x4_to_private(a); simde_uint32x4_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfle_vf_f32m1_b32(a_.sv128, 0, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= SIMDE_FLOAT32_C(0.0)); #else SIMDE_VECTORIZE @@ -150,13 +161,17 @@ simde_uint64x2_t simde_vclezq_f64(simde_float64x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_f64(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_f64(a, simde_vdupq_n_f64(SIMDE_FLOAT64_C(0.0))); #else simde_float64x2_private a_ = simde_float64x2_to_private(a); simde_uint64x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfle_vf_f64m1_b64(a_.sv128, 0, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= SIMDE_FLOAT64_C(0.0)); #else SIMDE_VECTORIZE @@ -178,13 +193,17 @@ simde_uint8x16_t simde_vclezq_s8(simde_int8x16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_s8(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_s8(a, simde_vdupq_n_s8(0)); #else simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_uint8x16_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsle_vx_i8m1_b8(a_.sv128, 0, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -206,13 +225,17 @@ simde_uint16x8_t simde_vclezq_s16(simde_int16x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_s16(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_s16(a, simde_vdupq_n_s16(0)); #else simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_uint16x8_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsle_vx_i16m1_b16(a_.sv128, 0, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -234,13 +257,17 @@ simde_uint32x4_t simde_vclezq_s32(simde_int32x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_s32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && 
!defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_s32(a, simde_vdupq_n_s32(0)); #else simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_uint32x4_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsle_vx_i32m1_b32(a_.sv128, 0, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -262,13 +289,17 @@ simde_uint64x2_t simde_vclezq_s64(simde_int64x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclezq_s64(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcleq_s64(a, simde_vdupq_n_s64(0)); #else simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_uint64x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsle_vx_i64m1_b64(a_.sv128, 0, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -294,10 +325,16 @@ simde_vclez_f16(simde_float16x4_t a) { simde_float16x4_private a_ = simde_float16x4_to_private(a); simde_uint16x4_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vclezh_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmfle_vf_f16m1_b16(a_.sv64, 0, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclezh_f16(a_.values[i]); + } + #endif return simde_uint16x4_from_private(r_); #endif @@ -312,13 +349,17 @@ simde_uint32x2_t simde_vclez_f32(simde_float32x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_f32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_f32(a, simde_vdup_n_f32(SIMDE_FLOAT32_C(0.0))); #else simde_float32x2_private a_ = simde_float32x2_to_private(a); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmfle_vf_f32m1_b32(a_.sv64, 0, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= SIMDE_FLOAT32_C(0.0)); #else SIMDE_VECTORIZE @@ -340,13 +381,17 @@ simde_uint64x1_t simde_vclez_f64(simde_float64x1_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_f64(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_f64(a, simde_vdup_n_f64(SIMDE_FLOAT64_C(0.0))); #else simde_float64x1_private a_ = simde_float64x1_to_private(a); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmfle_vf_f64m1_b64(a_.sv64, 0, 1); + r_.sv64 = 
__riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= SIMDE_FLOAT64_C(0.0)); #else SIMDE_VECTORIZE @@ -368,13 +413,17 @@ simde_uint8x8_t simde_vclez_s8(simde_int8x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_s8(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_s8(a, simde_vdup_n_s8(0)); #else simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_uint8x8_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsle_vx_i8m1_b8(a_.sv64, 0, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -396,13 +445,17 @@ simde_uint16x4_t simde_vclez_s16(simde_int16x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_s16(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_s16(a, simde_vdup_n_s16(0)); #else simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_uint16x4_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsle_vx_i16m1_b16(a_.sv64, 0, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -424,13 +477,17 @@ simde_uint32x2_t simde_vclez_s32(simde_int32x2_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_s32(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_s32(a, simde_vdup_n_s32(0)); #else simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsle_vx_i32m1_b32(a_.sv64, 0, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE @@ -452,13 +509,17 @@ simde_uint64x1_t simde_vclez_s64(simde_int64x1_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vclez_s64(a); - #elif SIMDE_NATURAL_VECTOR_SIZE > 0 + #elif SIMDE_NATURAL_VECTOR_SIZE > 0 && !defined(SIMDE_RISCV_V_NATIVE) return simde_vcle_s64(a, simde_vdup_n_s64(0)); #else simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsle_vx_i64m1_b64(a_.sv64, 0, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values <= 0); #else SIMDE_VECTORIZE 
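Every comparison hunk in this part of the patch (cge and cgt above, cle and clez above, clt below) materialises a NEON-style result from an RVV compare the same way: the compare intrinsic returns a mask register (a vboolN_t), the destination vector is zero-filled with vmv.v.x, and vmerge.vxm then writes -1 (all bits set) into the lanes selected by the mask, so true lanes become all-ones and false lanes stay all-zeros. The following is a minimal, self-contained sketch of that idiom outside SIMDe; the helper name, the plain-array interface, and the main() driver are illustrative only, and, like the patch's fixed vl arguments (e.g. 16 for i8m1), it assumes a vector unit with VLEN >= 128.

    #include <riscv_vector.h>
    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    /* Hypothetical helper (not part of the patch): produce a NEON-style
     * "all ones if true, all zeros if false" greater-than result for four
     * int32 lanes using the same three-step RVV idiom as the hunks above:
     *   1. vmsgt.vv yields a lane mask (vbool32_t);
     *   2. vmv.v.x zero-fills the destination vector;
     *   3. vmerge.vxm writes an all-ones scalar into the masked-on lanes. */
    static void cmpgt_s32x4(const int32_t a[4], const int32_t b[4], uint32_t out[4]) {
      size_t vl = 4;  /* fixed element count; assumes VLEN >= 128, as the patch does */
      vint32m1_t va = __riscv_vle32_v_i32m1(a, vl);
      vint32m1_t vb = __riscv_vle32_v_i32m1(b, vl);
      vbool32_t gt = __riscv_vmsgt_vv_i32m1_b32(va, vb, vl);  /* lane mask: a > b  */
      vuint32m1_t r = __riscv_vmv_v_x_u32m1(0, vl);           /* false lanes -> 0  */
      r = __riscv_vmerge_vxm_u32m1(r, UINT32_MAX, gt, vl);    /* true lanes -> ~0  */
      __riscv_vse32_v_u32m1(out, r, vl);
    }

    int main(void) {
      const int32_t a[4] = { 1, 5, -3, 7 };
      const int32_t b[4] = { 2, 5, -4, 6 };
      uint32_t r[4];
      cmpgt_s32x4(a, b, r);
      for (int i = 0; i < 4; i++)
        printf("%08" PRIx32 " ", r[i]);  /* expected: 00000000 00000000 ffffffff ffffffff */
      printf("\n");
      return 0;
    }

With a recent GCC or Clang RVV toolchain this should build with something like `-march=rv64gcv`. The same pattern generalises to the other element widths and signednesses by switching the m1 vector type, the compare intrinsic (vmsgtu/vmsle/vmslt/vmfgt/...), and the _bN mask suffix, which is exactly what the surrounding hunks do.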
diff --git a/simde/arm/neon/clt.h b/simde/arm/neon/clt.h index 9d3cf4076..8f1281ae0 100644 --- a/simde/arm/neon/clt.h +++ b/simde/arm/neon/clt.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CLT_H) @@ -120,10 +121,16 @@ simde_vcltq_f16(simde_float16x8_t a, simde_float16x8_t b) { b_ = simde_float16x8_to_private(b); simde_uint16x8_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmflt_vv_f16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x8_from_private(r_); #endif @@ -150,6 +157,10 @@ simde_vcltq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128i = _mm_castps_si128(_mm_cmplt_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmflt_vv_f32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -184,6 +195,10 @@ simde_vcltq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128i = _mm_castpd_si128(_mm_cmplt_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmflt_vv_f64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -218,6 +233,10 @@ simde_vcltq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_cmplt_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmslt_vv_i8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -252,6 +271,10 @@ simde_vcltq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_cmplt_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmslt_vv_i16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -286,6 +309,10 @@ simde_vcltq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_cmplt_epi32(a_.m128i, b_.m128i); #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmslt_vv_i32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -320,6 +347,10 @@ simde_vcltq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) r_.m128i = _mm_cmpgt_epi64(b_.m128i, a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmslt_vv_i64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -357,6 +388,10 @@ simde_vcltq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsltu_vv_u8m1_b8(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vmv_v_x_u8m1(0, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, -1, result, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -397,6 +432,10 @@ simde_vcltq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_cmplt_epi16(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsltu_vv_u16m1_b16(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vmv_v_x_u16m1(0, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -437,6 +476,10 @@ simde_vcltq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.m128i = _mm_cmplt_epi32(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_lt(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsltu_vv_u32m1_b32(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vmv_v_x_u32m1(0, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -475,6 +518,10 @@ simde_vcltq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #elif defined(SIMDE_X86_SSE4_2_NATIVE) __m128i sign_bits = _mm_set1_epi64x(INT64_MIN); r_.m128i = _mm_cmpgt_epi64(_mm_xor_si128(b_.m128i, sign_bits), _mm_xor_si128(a_.m128i, sign_bits)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsltu_vv_u64m1_b64(a_.sv128, b_.sv128, 2); + r_.sv128 = __riscv_vmv_v_x_u64m1(0, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -503,10 +550,16 @@ simde_vclt_f16(simde_float16x4_t a, simde_float16x4_t b) { b_ = simde_float16x4_to_private(b); simde_uint16x4_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = 
simde_vclth_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + vbool16_t result = __riscv_vmflt_vv_f16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x4_from_private(r_); #endif @@ -527,7 +580,11 @@ simde_vclt_f32(simde_float32x2_t a, simde_float32x2_t b) { b_ = simde_float32x2_to_private(b); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmflt_vv_f32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else SIMDE_VECTORIZE @@ -555,7 +612,11 @@ simde_vclt_f64(simde_float64x1_t a, simde_float64x1_t b) { b_ = simde_float64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmflt_vv_f64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else SIMDE_VECTORIZE @@ -585,6 +646,10 @@ simde_vclt_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi8(b_.m64, a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmslt_vv_i8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -615,6 +680,10 @@ simde_vclt_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi16(b_.m64, a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmslt_vv_i16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -645,6 +714,10 @@ simde_vclt_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_cmpgt_pi32(b_.m64, a_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmslt_vv_i32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -673,7 +746,11 @@ simde_vclt_s64(simde_int64x1_t a, simde_int64x1_t b) { b_ = simde_int64x1_to_private(b); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmslt_vv_i64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = 
__riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else SIMDE_VECTORIZE @@ -704,6 +781,10 @@ simde_vclt_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi8(INT8_MIN); r_.m64 = _mm_cmpgt_pi8(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t result = __riscv_vmsltu_vv_u8m1_b8(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vmv_v_x_u8m1(0, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, -1, result, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -735,6 +816,10 @@ simde_vclt_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi16(INT16_MIN); r_.m64 = _mm_cmpgt_pi16(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool16_t result = __riscv_vmsltu_vv_u16m1_b16(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vmv_v_x_u16m1(0, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, -1, result, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -766,6 +851,10 @@ simde_vclt_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) __m64 sign_bits = _mm_set1_pi32(INT32_MIN); r_.m64 = _mm_cmpgt_pi32(_mm_xor_si64(b_.m64, sign_bits), _mm_xor_si64(a_.m64, sign_bits)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool32_t result = __riscv_vmsltu_vv_u32m1_b32(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vmv_v_x_u32m1(0, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, -1, result, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else @@ -794,7 +883,11 @@ simde_vclt_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + vbool64_t result = __riscv_vmsltu_vv_u64m1_b64(a_.sv64, b_.sv64, 1); + r_.sv64 = __riscv_vmv_v_x_u64m1(0, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, -1, result, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < b_.values); #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/cnt.h b/simde/arm/neon/cnt.h index 9169f7e24..faeaf51c4 100644 --- a/simde/arm/neon/cnt.h +++ b/simde/arm/neon/cnt.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CNT_H) @@ -55,10 +56,24 @@ simde_vcnt_s8(simde_int8x8_t a) { r_, a_ = simde_int8x8_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int8_t, simde_x_arm_neon_cntb(HEDLEY_STATIC_CAST(uint8_t, a_.values[i]))); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t p = __riscv_vreinterpret_v_i8m1_u8m1(a_.sv64); + vuint8m1_t tmp = __riscv_vand_vv_u8m1(__riscv_vsrl_vx_u8m1(p , 1 , 8) , __riscv_vmv_v_x_u8m1(0x55 , 8) , 8); + p = __riscv_vsub_vv_u8m1(p , tmp , 8); + tmp = 
p; + p = __riscv_vand_vv_u8m1(p , __riscv_vmv_v_x_u8m1(0x33 , 8) , 8); + tmp = __riscv_vand_vv_u8m1(__riscv_vsrl_vx_u8m1(tmp , 2 , 8) , __riscv_vmv_v_x_u8m1(0x33 , 8) , 8); + p = __riscv_vadd_vv_u8m1(p , tmp , 8); + tmp = __riscv_vsrl_vx_u8m1(p, 4 , 8); + p = __riscv_vadd_vv_u8m1(p , tmp , 8); + p = __riscv_vand_vv_u8m1(p , __riscv_vmv_v_x_u8m1(0xf , 8) , 8); + r_.sv64 = __riscv_vreinterpret_v_u8m1_i8m1(p); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, simde_x_arm_neon_cntb(HEDLEY_STATIC_CAST(uint8_t, a_.values[i]))); + } + #endif return simde_int8x8_from_private(r_); #endif @@ -140,6 +155,16 @@ simde_vcntq_s8(simde_int8x16_t a) { tmp = _mm_srli_epi16(a_.m128i, 4); a_.m128i = _mm_add_epi8(a_.m128i, tmp); r_.m128i = _mm_and_si128(a_.m128i, _mm_set1_epi8(0x0f)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t tmp = __riscv_vand_vv_i8m1(__riscv_vsra_vx_i8m1(a_.sv128 , 1 , 16) , __riscv_vmv_v_x_i8m1(0x55 , 16) , 16); + a_.sv128 = __riscv_vsub_vv_i8m1(a_.sv128 , tmp , 16); + tmp = a_.sv128; + a_.sv128 = __riscv_vand_vv_i8m1(a_.sv128 , __riscv_vmv_v_x_i8m1(0x33 , 16) , 16); + tmp = __riscv_vand_vv_i8m1(__riscv_vsra_vx_i8m1(tmp , 2 , 16) , __riscv_vmv_v_x_i8m1(0x33 , 16) , 16); + a_.sv128 = __riscv_vadd_vv_i8m1(a_.sv128 , tmp , 16); + tmp = __riscv_vsra_vx_i8m1(a_.sv128, 4 , 16); + a_.sv128 = __riscv_vadd_vv_i8m1(a_.sv128 , tmp , 16); + r_.sv128 = __riscv_vand_vv_i8m1(a_.sv128 , __riscv_vmv_v_x_i8m1(0xf , 16) , 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/simde/arm/neon/fma.h b/simde/arm/neon/fma.h index aaf9e04e0..ecf90d5b5 100644 --- a/simde/arm/neon/fma.h +++ b/simde/arm/neon/fma.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Atharva Nimbalkar * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +* 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_FMA_H) @@ -54,6 +55,15 @@ simde_float32x2_t simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfma_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b), + c_ = simde_float32x2_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vadd_f32(a, simde_vmul_f32(b, c)); #endif @@ -68,6 +78,15 @@ simde_float64x1_t simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfma_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b), + c_ = simde_float64x1_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f64m1(a_.sv64 , b_.sv64 , c_.sv64 , 1); + return simde_float64x1_from_private(r_); #else return simde_vadd_f64(a, simde_vmul_f64(b, c)); #endif @@ -82,6 +101,15 @@ simde_float16x4_t simde_vfma_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) return vfma_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = 
simde_float16x4_to_private(b), + c_ = simde_float16x4_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_float16x4_from_private(r_); #else return simde_vadd_f16(a, simde_vmul_f16(b, c)); #endif @@ -96,6 +124,15 @@ simde_float16x8_t simde_vfmaq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) return vfmaq_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b), + c_ = simde_float16x8_to_private(c); + + r_.sv128 = __riscv_vfmacc_vv_f16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_float16x8_from_private(r_); #else return simde_vaddq_f16(a, simde_vmulq_f16(b, c)); #endif @@ -113,7 +150,7 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_madd(b, c, a); #elif \ - defined(SIMDE_X86_FMA_NATIVE) + defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), @@ -122,6 +159,8 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128 = _mm_fmadd_ps(b_.m128, c_.m128, a_.m128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vv_f32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); #endif return simde_float32x4_from_private(r_); @@ -142,7 +181,7 @@ simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) return vec_madd(b, c, a); #elif \ - defined(SIMDE_X86_FMA_NATIVE) + defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), @@ -151,6 +190,8 @@ simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128d = _mm_fmadd_pd(b_.m128d, c_.m128d, a_.m128d); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vv_f64m1(a_.sv128 , b_.sv128 , c_.sv128 , 2); #endif return simde_float64x2_from_private(r_); diff --git a/simde/arm/neon/fms.h b/simde/arm/neon/fms.h index 0ad265c3d..21823f2c0 100644 --- a/simde/arm/neon/fms.h +++ b/simde/arm/neon/fms.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +* 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_FMS_H) @@ -54,6 +55,14 @@ simde_float32x2_t simde_vfms_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfms_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b), + c_ = simde_float32x2_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vadd_f32(a, simde_vneg_f32(simde_vmul_f32(b, c))); #endif @@ -68,6 +77,14 @@ simde_float64x1_t simde_vfms_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfms_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + 
b_ = simde_float64x1_to_private(b), + c_ = simde_float64x1_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f64m1(a_.sv64 , b_.sv64 , c_.sv64 , 1); + return simde_float64x1_from_private(r_); #else return simde_vadd_f64(a, simde_vneg_f64(simde_vmul_f64(b, c))); #endif @@ -82,6 +99,14 @@ simde_float16x4_t simde_vfms_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) return vfms_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b), + c_ = simde_float16x4_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_float16x4_from_private(r_); #else return simde_vadd_f16(a, simde_vneg_f16(simde_vmul_f16(b, c))); #endif @@ -96,6 +121,14 @@ simde_float16x8_t simde_vfmsq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) return vfmsq_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b), + c_ = simde_float16x8_to_private(c); + r_.sv128 = __riscv_vfnmsac_vv_f16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_float16x8_from_private(r_); #else return simde_vaddq_f16(a, simde_vnegq_f16(simde_vmulq_f16(b, c))); #endif @@ -110,6 +143,14 @@ simde_float32x4_t simde_vfmsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmsq_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b), + c_ = simde_float32x4_to_private(c); + r_.sv128 = __riscv_vfnmsac_vv_f32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_float32x4_from_private(r_); #else return simde_vaddq_f32(a, simde_vnegq_f32(simde_vmulq_f32(b, c))); #endif @@ -124,6 +165,14 @@ simde_float64x2_t simde_vfmsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmsq_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b), + c_ = simde_float64x2_to_private(c); + r_.sv128 = __riscv_vfnmsac_vv_f64m1(a_.sv128 , b_.sv128 , c_.sv128 , 2); + return simde_float64x2_from_private(r_); #else return simde_vaddq_f64(a, simde_vnegq_f64(simde_vmulq_f64(b, c))); #endif diff --git a/simde/arm/neon/fms_n.h b/simde/arm/neon/fms_n.h index 6011ae415..6783988a2 100644 --- a/simde/arm/neon/fms_n.h +++ b/simde/arm/neon/fms_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +* 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_FMS_N_H) @@ -40,6 +41,13 @@ simde_float16x4_t simde_vfms_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) return vfms_n_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + 
simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + r_.sv64 = __riscv_vfnmsac_vf_f16m1(a_.sv64 , c , b_.sv64 , 4); + return simde_float16x4_from_private(r_); #else return simde_vfms_f16(a, b, simde_vdup_n_f16(c)); #endif @@ -54,6 +62,13 @@ simde_float16x8_t simde_vfmsq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) return vfmsq_n_f16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + r_.sv128 = __riscv_vfnmsac_vf_f16m1(a_.sv128 , c , b_.sv128 , 8); + return simde_float16x8_from_private(r_); #else return simde_vfmsq_f16(a, b, simde_vdupq_n_f16(c)); #endif @@ -68,6 +83,13 @@ simde_float32x2_t simde_vfms_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) return vfms_n_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + r_.sv64 = __riscv_vfnmsac_vf_f32m1(a_.sv64 , c , b_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vfms_f32(a, b, simde_vdup_n_f32(c)); #endif @@ -82,6 +104,13 @@ simde_float64x1_t simde_vfms_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vfms_n_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + r_.sv64 = __riscv_vfnmsac_vf_f64m1(a_.sv64 , c , b_.sv64 , 1); + return simde_float64x1_from_private(r_); #else return simde_vfms_f64(a, b, simde_vdup_n_f64(c)); #endif @@ -96,6 +125,13 @@ simde_float32x4_t simde_vfmsq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) return vfmsq_n_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + r_.sv128 = __riscv_vfnmsac_vf_f32m1(a_.sv128 , c , b_.sv128 , 4); + return simde_float32x4_from_private(r_); #else return simde_vfmsq_f32(a, b, simde_vdupq_n_f32(c)); #endif @@ -110,6 +146,13 @@ simde_float64x2_t simde_vfmsq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vfmsq_n_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b); + r_.sv128 = __riscv_vfnmsac_vf_f64m1(a_.sv128 , c , b_.sv128 , 2); + return simde_float64x2_from_private(r_); #else return simde_vfmsq_f64(a, b, simde_vdupq_n_f64(c)); #endif diff --git a/simde/arm/neon/get_high.h b/simde/arm/neon/get_high.h index 
df37cccca..899dc3f45 100644 --- a/simde/arm/neon/get_high.h +++ b/simde/arm/neon/get_high.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_GET_HIGH_H) @@ -43,12 +44,14 @@ simde_vget_high_f16(simde_float16x8_t a) { #else simde_float16x4_private r_; simde_float16x8_private a_ = simde_float16x8_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vslidedown_vx_f16m1(a_.sv128 , 4 , 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + #endif return simde_float16x4_from_private(r_); #endif } @@ -66,7 +69,9 @@ simde_vget_high_f32(simde_float32x4_t a) { simde_float32x2_private r_; simde_float32x4_private a_ = simde_float32x4_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_f32m1(a_.sv128 , 2 , 4); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 2, 3); #else SIMDE_VECTORIZE @@ -92,7 +97,9 @@ simde_vget_high_f64(simde_float64x2_t a) { simde_float64x1_private r_; simde_float64x2_private a_ = simde_float64x2_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_f64m1(a_.sv128 , 1 , 2); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 1); #else SIMDE_VECTORIZE @@ -118,7 +125,9 @@ simde_vget_high_s8(simde_int8x16_t a) { simde_int8x8_private r_; simde_int8x16_private a_ = simde_int8x16_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 8, 9, 10, 11, 12, 13, 14, 15); #else SIMDE_VECTORIZE @@ -144,7 +153,9 @@ simde_vget_high_s16(simde_int16x8_t a) { simde_int16x4_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 4, 5, 6, 7); #else SIMDE_VECTORIZE @@ -170,7 +181,9 @@ simde_vget_high_s32(simde_int32x4_t a) { simde_int32x2_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_i32m1(a_.sv128 , 2 , 4); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 2, 3); #else SIMDE_VECTORIZE @@ -196,7 +209,9 @@ simde_vget_high_s64(simde_int64x2_t a) { simde_int64x1_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_i64m1(a_.sv128 , 1 , 2); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) 
r_.values = __builtin_shufflevector(a_.values, a_.values, 1); #else SIMDE_VECTORIZE @@ -222,7 +237,9 @@ simde_vget_high_u8(simde_uint8x16_t a) { simde_uint8x8_private r_; simde_uint8x16_private a_ = simde_uint8x16_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 8, 9, 10, 11, 12, 13, 14,15); #else SIMDE_VECTORIZE @@ -248,7 +265,9 @@ simde_vget_high_u16(simde_uint16x8_t a) { simde_uint16x4_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 4, 5, 6, 7); #else SIMDE_VECTORIZE @@ -274,7 +293,9 @@ simde_vget_high_u32(simde_uint32x4_t a) { simde_uint32x2_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_u32m1(a_.sv128 , 2 , 4); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 2, 3); #else SIMDE_VECTORIZE @@ -300,7 +321,9 @@ simde_vget_high_u64(simde_uint64x2_t a) { simde_uint64x1_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vslidedown_vx_u64m1(a_.sv128 , 1 , 2); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 1); #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/get_low.h b/simde/arm/neon/get_low.h index 4594a3064..99180cb72 100644 --- a/simde/arm/neon/get_low.h +++ b/simde/arm/neon/get_low.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_GET_LOW_H) @@ -44,10 +45,14 @@ simde_vget_low_f16(simde_float16x8_t a) { simde_float16x4_private r_; simde_float16x8_private a_ = simde_float16x8_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = a_.sv128; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + #endif return simde_float16x4_from_private(r_); #endif @@ -66,7 +71,9 @@ simde_vget_low_f32(simde_float32x4_t a) { simde_float32x2_private r_; simde_float32x4_private a_ = simde_float32x4_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1); #else SIMDE_VECTORIZE @@ -92,7 +99,9 @@ simde_vget_low_f64(simde_float64x2_t a) { simde_float64x1_private r_; simde_float64x2_private a_ = simde_float64x2_to_private(a); - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, 
a_.values, 0); #else SIMDE_VECTORIZE @@ -120,6 +129,8 @@ simde_vget_low_s8(simde_int8x16_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1, 2, 3, 4, 5, 6, 7); @@ -150,6 +161,8 @@ simde_vget_low_s16(simde_int16x8_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1, 2, 3); @@ -180,6 +193,8 @@ simde_vget_low_s32(simde_int32x4_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1); @@ -210,6 +225,8 @@ simde_vget_low_s64(simde_int64x2_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0); @@ -240,6 +257,8 @@ simde_vget_low_u8(simde_uint8x16_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1, 2, 3, 4, 5, 6, 7); @@ -270,6 +289,8 @@ simde_vget_low_u16(simde_uint16x8_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1, 2, 3); @@ -300,6 +321,8 @@ simde_vget_low_u32(simde_uint32x4_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0, 1); @@ -330,6 +353,8 @@ simde_vget_low_u64(simde_uint64x2_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_movepi64_pi64(a_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = a_.sv128; #else #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(a_.values, a_.values, 0); diff --git a/simde/arm/neon/hsub.h b/simde/arm/neon/hsub.h index d8e7e02fb..17c563b95 100644 --- a/simde/arm/neon/hsub.h +++ b/simde/arm/neon/hsub.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ /* TODO: the 128-bit versions only require AVX-512 because of the final @@ -46,6 +47,14 @@ simde_int8x8_t simde_vhsub_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b); + + r_.sv64 = __riscv_vasub_vv_i8m1(a_.sv64, b_.sv64, 2, 8); + return 
simde_int8x8_from_private(r_); #else return simde_vmovn_s16(simde_vshrq_n_s16(simde_vsubl_s8(a, b), 1)); #endif @@ -60,6 +69,14 @@ simde_int16x4_t simde_vhsub_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + + r_.sv64 = __riscv_vasub_vv_i16m1(a_.sv64, b_.sv64, 2, 4); + return simde_int16x4_from_private(r_); #else return simde_vmovn_s32(simde_vshrq_n_s32(simde_vsubl_s16(a, b), 1)); #endif @@ -74,6 +91,14 @@ simde_int32x2_t simde_vhsub_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + + r_.sv64 = __riscv_vasub_vv_i32m1(a_.sv64, b_.sv64, 2, 2); + return simde_int32x2_from_private(r_); #else return simde_vmovn_s64(simde_vshrq_n_s64(simde_vsubl_s32(a, b), 1)); #endif @@ -88,6 +113,14 @@ simde_uint8x8_t simde_vhsub_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a), + b_ = simde_uint8x8_to_private(b); + + r_.sv64 = __riscv_vasubu_vv_u8m1(a_.sv64, b_.sv64, 2, 8); + return simde_uint8x8_from_private(r_); #else return simde_vmovn_u16(simde_vshrq_n_u16(simde_vsubl_u8(a, b), 1)); #endif @@ -102,6 +135,14 @@ simde_uint16x4_t simde_vhsub_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b); + + r_.sv64 = __riscv_vasubu_vv_u16m1(a_.sv64, b_.sv64, 2, 4); + return simde_uint16x4_from_private(r_); #else return simde_vmovn_u32(simde_vshrq_n_u32(simde_vsubl_u16(a, b), 1)); #endif @@ -116,6 +157,14 @@ simde_uint32x2_t simde_vhsub_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhsub_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b); + + r_.sv64 = __riscv_vasubu_vv_u32m1(a_.sv64, b_.sv64, 2, 2); + return simde_uint32x2_from_private(r_); #else return simde_vmovn_u64(simde_vshrq_n_u64(simde_vsubl_u32(a, b), 1)); #endif @@ -138,6 +187,8 @@ simde_vhsubq_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) r_.m128i = _mm256_cvtepi16_epi8(_mm256_srai_epi16(_mm256_sub_epi16(_mm256_cvtepi8_epi16(a_.m128i), _mm256_cvtepi8_epi16(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasub_vv_i8m1(a_.sv128, b_.sv128, 2, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -166,6 +217,8 @@ simde_vhsubq_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi32_epi16(_mm256_srai_epi32(_mm256_sub_epi32(_mm256_cvtepi16_epi32(a_.m128i), _mm256_cvtepi16_epi32(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasub_vv_i16m1(a_.sv128, b_.sv128, 2, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -194,6 +247,8 @@ simde_vhsubq_s32(simde_int32x4_t a, simde_int32x4_t b) 
{ #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi64_epi32(_mm256_srai_epi64(_mm256_sub_epi64(_mm256_cvtepi32_epi64(a_.m128i), _mm256_cvtepi32_epi64(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasub_vv_i32m1(a_.sv128, b_.sv128, 2, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -222,6 +277,8 @@ simde_vhsubq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) r_.m128i = _mm256_cvtepi16_epi8(_mm256_srli_epi16(_mm256_sub_epi16(_mm256_cvtepu8_epi16(a_.m128i), _mm256_cvtepu8_epi16(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasubu_vv_u8m1(a_.sv128, b_.sv128, 2, 16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) v128_t lo = wasm_u16x8_shr(wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(a_.v128), @@ -261,6 +318,8 @@ simde_vhsubq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi32_epi16(_mm256_srli_epi32(_mm256_sub_epi32(_mm256_cvtepu16_epi32(a_.m128i), _mm256_cvtepu16_epi32(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasubu_vv_u16m1(a_.sv128, b_.sv128, 2, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -289,6 +348,8 @@ simde_vhsubq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi64_epi32(_mm256_srli_epi64(_mm256_sub_epi64(_mm256_cvtepu32_epi64(a_.m128i), _mm256_cvtepu32_epi64(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vasubu_vv_u32m1(a_.sv128, b_.sv128, 2, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/simde/arm/neon/mla.h b/simde/arm/neon/mla.h index 4c57edaf6..aaf24a02b 100644 --- a/simde/arm/neon/mla.h +++ b/simde/arm/neon/mla.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLA_H) @@ -41,6 +42,15 @@ simde_float32x2_t simde_vmla_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b), + c_ = simde_float32x2_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vadd_f32(simde_vmul_f32(b, c), a); #endif @@ -55,6 +65,15 @@ simde_float64x1_t simde_vmla_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmla_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b), + c_ = simde_float64x1_to_private(c); + + r_.sv64 = __riscv_vfmacc_vv_f64m1(a_.sv64 , b_.sv64 , c_.sv64 , 1); + return simde_float64x1_from_private(r_); #else return simde_vadd_f64(simde_vmul_f64(b, c), a); #endif @@ -69,6 +88,15 @@ simde_int8x8_t simde_vmla_s8(simde_int8x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b), + c_ = 
simde_int8x8_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_i8m1(a_.sv64 , b_.sv64 , c_.sv64 , 8); + return simde_int8x8_from_private(r_); #else return simde_vadd_s8(simde_vmul_s8(b, c), a); #endif @@ -83,6 +111,15 @@ simde_int16x4_t simde_vmla_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b), + c_ = simde_int16x4_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_i16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_int16x4_from_private(r_); #else return simde_vadd_s16(simde_vmul_s16(b, c), a); #endif @@ -97,6 +134,15 @@ simde_int32x2_t simde_vmla_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b), + c_ = simde_int32x2_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_i32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_int32x2_from_private(r_); #else return simde_vadd_s32(simde_vmul_s32(b, c), a); #endif @@ -111,6 +157,15 @@ simde_uint8x8_t simde_vmla_u8(simde_uint8x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a), + b_ = simde_uint8x8_to_private(b), + c_ = simde_uint8x8_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_u8m1(a_.sv64 , b_.sv64 , c_.sv64 , 8); + return simde_uint8x8_from_private(r_); #else return simde_vadd_u8(simde_vmul_u8(b, c), a); #endif @@ -125,6 +180,15 @@ simde_uint16x4_t simde_vmla_u16(simde_uint16x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b), + c_ = simde_uint16x4_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_u16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_uint16x4_from_private(r_); #else return simde_vadd_u16(simde_vmul_u16(b, c), a); #endif @@ -139,6 +203,15 @@ simde_uint32x2_t simde_vmla_u32(simde_uint32x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmla_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b), + c_ = simde_uint32x2_to_private(c); + + r_.sv64 = __riscv_vmacc_vv_u32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_uint32x2_from_private(r_); #else return simde_vadd_u32(simde_vmul_u32(b, c), a); #endif @@ -156,7 +229,7 @@ simde_vmlaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_madd(b, c, a); #elif \ - defined(SIMDE_X86_FMA_NATIVE) + defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), @@ -165,6 +238,8 @@ simde_vmlaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128 = _mm_fmadd_ps(b_.m128, c_.m128, a_.m128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vv_f32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); #endif return simde_float32x4_from_private(r_); @@ -185,7 +260,7 @@ 
simde_vmlaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) return vec_madd(b, c, a); #elif \ - defined(SIMDE_X86_FMA_NATIVE) + defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), @@ -194,6 +269,8 @@ simde_vmlaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_X86_FMA_NATIVE) r_.m128d = _mm_fmadd_pd(b_.m128d, c_.m128d, a_.m128d); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vv_f64m1(a_.sv128 , b_.sv128 , c_.sv128 , 2); #endif return simde_float64x2_from_private(r_); @@ -211,6 +288,15 @@ simde_int8x16_t simde_vmlaq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b), + c_ = simde_int8x16_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_i8m1(a_.sv128 , b_.sv128 , c_.sv128 , 16); + return simde_int8x16_from_private(r_); #else return simde_vaddq_s8(simde_vmulq_s8(b, c), a); #endif @@ -225,6 +311,15 @@ simde_int16x8_t simde_vmlaq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_i16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_int16x8_from_private(r_); #else return simde_vaddq_s16(simde_vmulq_s16(b, c), a); #endif @@ -239,6 +334,15 @@ simde_int32x4_t simde_vmlaq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_i32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_int32x4_from_private(r_); #else return simde_vaddq_s32(simde_vmulq_s32(b, c), a); #endif @@ -253,6 +357,15 @@ simde_uint8x16_t simde_vmlaq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b), + c_ = simde_uint8x16_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_u8m1(a_.sv128 , b_.sv128 , c_.sv128 , 16); + return simde_uint8x16_from_private(r_); #else return simde_vaddq_u8(simde_vmulq_u8(b, c), a); #endif @@ -267,6 +380,15 @@ simde_uint16x8_t simde_vmlaq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b), + c_ = simde_uint16x8_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_u16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_uint16x8_from_private(r_); #else return simde_vaddq_u16(simde_vmulq_u16(b, c), a); #endif @@ -281,6 +403,15 @@ simde_uint32x4_t simde_vmlaq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_u32(a, b, c); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + + r_.sv128 = __riscv_vmacc_vv_u32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_uint32x4_from_private(r_); #else return simde_vaddq_u32(simde_vmulq_u32(b, c), a); #endif diff --git a/simde/arm/neon/mla_n.h b/simde/arm/neon/mla_n.h index f4521eb5f..ecb726d9b 100644 --- a/simde/arm/neon/mla_n.h +++ b/simde/arm/neon/mla_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLA_N_H) @@ -48,7 +49,9 @@ simde_vmla_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32 c) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmacc_vf_f32m1(a_.sv64 , c , b_.sv64 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -76,7 +79,9 @@ simde_vmla_n_s16(simde_int16x4_t a, simde_int16x4_t b, int16_t c) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmacc_vx_i16m1(a_.sv64 , c , b_.sv64 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) && !defined(SIMDE_BUG_GCC_100762) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -104,7 +109,9 @@ simde_vmla_n_s32(simde_int32x2_t a, simde_int32x2_t b, int32_t c) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmacc_vx_i32m1(a_.sv64 , c , b_.sv64 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -132,7 +139,9 @@ simde_vmla_n_u16(simde_uint16x4_t a, simde_uint16x4_t b, uint16_t c) { a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmacc_vx_u16m1(a_.sv64 , c , b_.sv64 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -160,7 +169,9 @@ simde_vmla_n_u32(simde_uint32x2_t a, simde_uint32x2_t b, uint32_t c) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmacc_vx_u32m1(a_.sv64 , c , b_.sv64 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -182,7 +193,7 @@ simde_float32x4_t simde_vmlaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32 c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_n_f32(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_f32(simde_vmulq_n_f32(b, c), a); #else 
simde_float32x4_private @@ -190,7 +201,9 @@ simde_vmlaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32 c) { a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmacc_vf_f32m1(a_.sv128 , c , b_.sv128 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -212,7 +225,7 @@ simde_int16x8_t simde_vmlaq_n_s16(simde_int16x8_t a, simde_int16x8_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_n_s16(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s16(simde_vmulq_n_s16(b, c), a); #else simde_int16x8_private @@ -220,7 +233,9 @@ simde_vmlaq_n_s16(simde_int16x8_t a, simde_int16x8_t b, int16_t c) { a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmacc_vx_i16m1(a_.sv128 , c , b_.sv128 , 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -242,7 +257,7 @@ simde_int32x4_t simde_vmlaq_n_s32(simde_int32x4_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_n_s32(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s32(simde_vmulq_n_s32(b, c), a); #else simde_int32x4_private @@ -250,7 +265,9 @@ simde_vmlaq_n_s32(simde_int32x4_t a, simde_int32x4_t b, int32_t c) { a_ = simde_int32x4_to_private(a), b_ = simde_int32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmacc_vx_i32m1(a_.sv128 , c , b_.sv128 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -272,7 +289,7 @@ simde_uint16x8_t simde_vmlaq_n_u16(simde_uint16x8_t a, simde_uint16x8_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_n_u16(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u16(simde_vmulq_n_u16(b, c), a); #else simde_uint16x8_private @@ -280,7 +297,9 @@ simde_vmlaq_n_u16(simde_uint16x8_t a, simde_uint16x8_t b, uint16_t c) { a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmacc_vx_u16m1(a_.sv128 , c , b_.sv128 , 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE @@ -310,7 +329,9 @@ simde_vmlaq_n_u32(simde_uint32x4_t a, simde_uint32x4_t b, uint32_t c) { a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmacc_vx_u32m1(a_.sv128 , c , b_.sv128 , 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = (b_.values * c) + a_.values; #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/mlal.h b/simde/arm/neon/mlal.h index 0403b81a8..594fc26e3 100644 --- a/simde/arm/neon/mlal.h +++ b/simde/arm/neon/mlal.h @@ -23,6 +23,7 @@ * Copyright: 
* 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLAL_H) @@ -41,6 +42,15 @@ simde_int16x8_t simde_vmlal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + simde_int8x8_private c_ = simde_int8x8_to_private(c); + vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv64); + vint8mf2_t vc = __riscv_vlmul_trunc_v_i8m1_i8mf2 (c_.sv64); + r_.sv128 = __riscv_vwmacc_vv_i16m1(a_.sv128 , vb , vc , 8); + return simde_int16x8_from_private(r_); #else return simde_vmlaq_s16(a, simde_vmovl_s8(b), simde_vmovl_s8(c)); #endif @@ -55,6 +65,15 @@ simde_int32x4_t simde_vmlal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + simde_int16x4_private c_ = simde_int16x4_to_private(c); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + vint16mf2_t vc = __riscv_vlmul_trunc_v_i16m1_i16mf2 (c_.sv64); + r_.sv128 = __riscv_vwmacc_vv_i32m1(a_.sv128 , vb , vc , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlaq_s32(a, simde_vmovl_s16(b), simde_vmovl_s16(c)); #endif @@ -69,6 +88,15 @@ simde_int64x2_t simde_vmlal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + simde_int32x2_private c_ = simde_int32x2_to_private(c); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + vint32mf2_t vc = __riscv_vlmul_trunc_v_i32m1_i32mf2 (c_.sv64); + r_.sv128 = __riscv_vwmacc_vv_i64m1(a_.sv128 , vb , vc , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, @@ -98,6 +126,15 @@ simde_uint16x8_t simde_vmlal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + simde_uint8x8_private c_ = simde_uint8x8_to_private(c); + vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64); + vuint8mf2_t vc = __riscv_vlmul_trunc_v_u8m1_u8mf2 (c_.sv64); + r_.sv128 = __riscv_vwmaccu_vv_u16m1(a_.sv128 , vb , vc , 8); + return simde_uint16x8_from_private(r_); #else return simde_vmlaq_u16(a, simde_vmovl_u8(b), simde_vmovl_u8(c)); #endif @@ -112,6 +149,15 @@ simde_uint32x4_t simde_vmlal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + simde_uint16x4_private c_ = simde_uint16x4_to_private(c); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + vuint16mf2_t vc = 
__riscv_vlmul_trunc_v_u16m1_u16mf2 (c_.sv64); + r_.sv128 = __riscv_vwmaccu_vv_u32m1(a_.sv128 , vb , vc , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlaq_u32(a, simde_vmovl_u16(b), simde_vmovl_u16(c)); #endif @@ -126,6 +172,15 @@ simde_uint64x2_t simde_vmlal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + simde_uint32x2_private c_ = simde_uint32x2_to_private(c); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + vuint32mf2_t vc = __riscv_vlmul_trunc_v_u32m1_u32mf2 (c_.sv64); + r_.sv128 = __riscv_vwmaccu_vv_u64m1(a_.sv128 , vb , vc , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/simde/arm/neon/mlal_high.h b/simde/arm/neon/mlal_high.h index f7222d16f..21e7221ce 100644 --- a/simde/arm/neon/mlal_high.h +++ b/simde/arm/neon/mlal_high.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLAL_HIGH_H) @@ -41,6 +42,15 @@ simde_int16x8_t simde_vmlal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + simde_int8x16_private c_ = simde_int8x16_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + c_.sv128 = __riscv_vslidedown_vx_i8m1(c_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwmacc_vv_i16m1(a_.sv128 , __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv128) , __riscv_vlmul_trunc_v_i8m1_i8mf2 (c_.sv128) , 8); + return simde_int16x8_from_private(r_); #else return simde_vmlaq_s16(a, simde_vmovl_high_s8(b), simde_vmovl_high_s8(c)); #endif @@ -55,6 +65,15 @@ simde_int32x4_t simde_vmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + simde_int16x8_private c_ = simde_int16x8_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + c_.sv128 = __riscv_vslidedown_vx_i16m1(c_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwmacc_vv_i32m1(a_.sv128 , __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv128) , __riscv_vlmul_trunc_v_i16m1_i16mf2 (c_.sv128) , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vmovl_high_s16(c)); #endif @@ -69,6 +88,15 @@ simde_int64x2_t simde_vmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + simde_int32x4_private c_ = simde_int32x4_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + c_.sv128 = __riscv_vslidedown_vx_i32m1(c_.sv128 , 2, 4); + r_.sv128 = __riscv_vwmacc_vv_i64m1(a_.sv128 , 
__riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv128) , __riscv_vlmul_trunc_v_i32m1_i32mf2 (c_.sv128) , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, @@ -98,6 +126,15 @@ simde_uint16x8_t simde_vmlal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + simde_uint8x16_private c_ = simde_uint8x16_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + c_.sv128 = __riscv_vslidedown_vx_u8m1(c_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwmaccu_vv_u16m1(a_.sv128 , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (c_.sv128) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vmlaq_u16(a, simde_vmovl_high_u8(b), simde_vmovl_high_u8(c)); #endif @@ -112,6 +149,15 @@ simde_uint32x4_t simde_vmlal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + simde_uint16x8_private c_ = simde_uint16x8_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + c_.sv128 = __riscv_vslidedown_vx_u16m1(c_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwmaccu_vv_u32m1(a_.sv128 , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (c_.sv128) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vmovl_high_u16(c)); #endif @@ -126,6 +172,15 @@ simde_uint64x2_t simde_vmlal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + simde_uint32x4_private c_ = simde_uint32x4_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + c_.sv128 = __riscv_vslidedown_vx_u32m1(c_.sv128 , 2, 4); + r_.sv128 = __riscv_vwmaccu_vv_u64m1(a_.sv128 , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (c_.sv128) , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/simde/arm/neon/mlal_high_n.h b/simde/arm/neon/mlal_high_n.h index 0c26174ec..876c19333 100644 --- a/simde/arm/neon/mlal_high_n.h +++ b/simde/arm/neon/mlal_high_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Décio Luiz Gazzoni Filho + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLAL_HIGH_N_H) @@ -41,6 +42,13 @@ simde_int32x4_t simde_vmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwmacc_vx_i32m1(a_.sv128 , c , __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv128) , 4); + return simde_int32x4_from_private(r_); #else return 
simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c)); #endif @@ -55,6 +63,13 @@ simde_int64x2_t simde_vmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwmacc_vx_i64m1(a_.sv128 , c , __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv128) , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, @@ -84,6 +99,13 @@ simde_uint32x4_t simde_vmlal_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwmaccu_vx_u32m1(a_.sv128 , c , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c)); #endif @@ -98,6 +120,13 @@ simde_uint64x2_t simde_vmlal_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlal_high_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwmaccu_vx_u64m1(a_.sv128 , c , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/simde/arm/neon/mlal_n.h b/simde/arm/neon/mlal_n.h index 6025492d2..6b585c58a 100644 --- a/simde/arm/neon/mlal_n.h +++ b/simde/arm/neon/mlal_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLAL_N_H) @@ -41,6 +42,13 @@ simde_int32x4_t simde_vmlal_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + r_.sv128 = __riscv_vwmacc_vx_i32m1(a_.sv128 , c , vb , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlaq_s32(a, simde_vmovl_s16(b), simde_vdupq_n_s32(c)); #endif @@ -55,13 +63,19 @@ simde_int64x2_t simde_vmlal_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + r_.sv128 = __riscv_vwmacc_vx_i64m1(a_.sv128 , c , vb , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, a_ = simde_int64x2_to_private(a), b_ = simde_int64x2_to_private(simde_vmovl_s32(b)), c_ = 
simde_int64x2_to_private(simde_vdupq_n_s64(c)); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = (b_.values * c_.values) + a_.values; #else @@ -84,6 +98,13 @@ simde_uint32x4_t simde_vmlal_n_u16(simde_uint32x4_t a, simde_uint16x4_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + r_.sv128 = __riscv_vwmaccu_vx_u32m1(a_.sv128 , c , vb , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlaq_u32(a, simde_vmovl_u16(b), simde_vdupq_n_u32(c)); #endif @@ -98,6 +119,13 @@ simde_uint64x2_t simde_vmlal_n_u32(simde_uint64x2_t a, simde_uint32x2_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlal_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + r_.sv128 = __riscv_vwmaccu_vx_u64m1(a_.sv128 , c , vb , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/simde/arm/neon/mls.h b/simde/arm/neon/mls.h index c92547f7d..0ee06a2b9 100644 --- a/simde/arm/neon/mls.h +++ b/simde/arm/neon/mls.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLS_H) @@ -39,6 +40,14 @@ simde_float32x2_t simde_vmls_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b), + c_ = simde_float32x2_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vsub_f32(a, simde_vmul_f32(b, c)); #endif @@ -53,6 +62,14 @@ simde_float64x1_t simde_vmls_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmls_f64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b), + c_ = simde_float64x1_to_private(c); + r_.sv64 = __riscv_vfnmsac_vv_f64m1(a_.sv64 , b_.sv64 , c_.sv64 , 1); + return simde_float64x1_from_private(r_); #else return simde_vsub_f64(a, simde_vmul_f64(b, c)); #endif @@ -67,6 +84,14 @@ simde_int8x8_t simde_vmls_s8(simde_int8x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b), + c_ = simde_int8x8_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_i8m1(a_.sv64 , b_.sv64 , c_.sv64 , 8); + return simde_int8x8_from_private(r_); #else return simde_vsub_s8(a, simde_vmul_s8(b, c)); #endif @@ -81,6 +106,14 @@ simde_int16x4_t simde_vmls_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b), + c_ = 
simde_int16x4_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_i16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_int16x4_from_private(r_); #else return simde_vsub_s16(a, simde_vmul_s16(b, c)); #endif @@ -95,6 +128,14 @@ simde_int32x2_t simde_vmls_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b), + c_ = simde_int32x2_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_i32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_int32x2_from_private(r_); #else return simde_vsub_s32(a, simde_vmul_s32(b, c)); #endif @@ -109,6 +150,14 @@ simde_uint8x8_t simde_vmls_u8(simde_uint8x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a), + b_ = simde_uint8x8_to_private(b), + c_ = simde_uint8x8_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_u8m1(a_.sv64 , b_.sv64 , c_.sv64 , 8); + return simde_uint8x8_from_private(r_); #else return simde_vsub_u8(a, simde_vmul_u8(b, c)); #endif @@ -123,6 +172,14 @@ simde_uint16x4_t simde_vmls_u16(simde_uint16x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b), + c_ = simde_uint16x4_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_u16m1(a_.sv64 , b_.sv64 , c_.sv64 , 4); + return simde_uint16x4_from_private(r_); #else return simde_vsub_u16(a, simde_vmul_u16(b, c)); #endif @@ -137,6 +194,14 @@ simde_uint32x2_t simde_vmls_u32(simde_uint32x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b), + c_ = simde_uint32x2_to_private(c); + r_.sv64 = __riscv_vnmsac_vv_u32m1(a_.sv64 , b_.sv64 , c_.sv64 , 2); + return simde_uint32x2_from_private(r_); #else return simde_vsub_u32(a, simde_vmul_u32(b, c)); #endif @@ -151,13 +216,19 @@ simde_float32x4_t simde_vmlsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_f32(a, b, c); - #elif defined(SIMDE_X86_FMA_NATIVE) + #elif defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b), c_ = simde_float32x4_to_private(c); - r_.m128 = _mm_fnmadd_ps(b_.m128, c_.m128, a_.m128); + + #if defined(SIMDE_X86_FMA_NATIVE) + r_.m128 = _mm_fnmadd_ps(b_.m128, c_.m128, a_.m128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfnmsac_vv_f32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + #endif + return simde_float32x4_from_private(r_); #else return simde_vsubq_f32(a, simde_vmulq_f32(b, c)); @@ -173,13 +244,19 @@ simde_float64x2_t simde_vmlsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsq_f64(a, b, c); - #elif defined(SIMDE_X86_FMA_NATIVE) + #elif defined(SIMDE_X86_FMA_NATIVE) || defined(SIMDE_RISCV_V_NATIVE) simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b), c_ = simde_float64x2_to_private(c); -
r_.m128d = _mm_fnmadd_pd(b_.m128d, c_.m128d, a_.m128d); + + #if defined(SIMDE_X86_FMA_NATIVE) + r_.m128d = _mm_fnmadd_pd(b_.m128d, c_.m128d, a_.m128d); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfnmsac_vv_f64m1(a_.sv128 , b_.sv128 , c_.sv128 , 2); + #endif + return simde_float64x2_from_private(r_); #else return simde_vsubq_f64(a, simde_vmulq_f64(b, c)); @@ -195,6 +272,14 @@ simde_int8x16_t simde_vmlsq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b), + c_ = simde_int8x16_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_i8m1(a_.sv128 , b_.sv128 , c_.sv128 , 16); + return simde_int8x16_from_private(r_); #else return simde_vsubq_s8(a, simde_vmulq_s8(b, c)); #endif @@ -209,6 +294,14 @@ simde_int16x8_t simde_vmlsq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_i16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(a, simde_vmulq_s16(b, c)); #endif @@ -223,6 +316,14 @@ simde_int32x4_t simde_vmlsq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_i32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(a, simde_vmulq_s32(b, c)); #endif @@ -237,6 +338,14 @@ simde_uint8x16_t simde_vmlsq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b), + c_ = simde_uint8x16_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_u8m1(a_.sv128 , b_.sv128 , c_.sv128 , 16); + return simde_uint8x16_from_private(r_); #else return simde_vsubq_u8(a, simde_vmulq_u8(b, c)); #endif @@ -251,6 +360,14 @@ simde_uint16x8_t simde_vmlsq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b), + c_ = simde_uint16x8_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_u16m1(a_.sv128 , b_.sv128 , c_.sv128 , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(a, simde_vmulq_u16(b, c)); #endif @@ -265,6 +382,14 @@ simde_uint32x4_t simde_vmlsq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + r_.sv128 = __riscv_vnmsac_vv_u32m1(a_.sv128 , b_.sv128 , c_.sv128 , 4); + return simde_uint32x4_from_private(r_); #else return 
simde_vsubq_u32(a, simde_vmulq_u32(b, c)); #endif diff --git a/simde/arm/neon/mls_n.h b/simde/arm/neon/mls_n.h index 2ff48e231..9a4239fe4 100644 --- a/simde/arm/neon/mls_n.h +++ b/simde/arm/neon/mls_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLS_N_H) @@ -40,6 +41,13 @@ simde_float32x2_t simde_vmls_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32 c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + r_.sv64 = __riscv_vfnmsac_vf_f32m1(a_.sv64 , c , b_.sv64 , 2); + return simde_float32x2_from_private(r_); #else return simde_vmls_f32(a, b, simde_vdup_n_f32(c)); #endif @@ -54,6 +62,13 @@ simde_int16x4_t simde_vmls_n_s16(simde_int16x4_t a, simde_int16x4_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + r_.sv64 = __riscv_vnmsac_vx_i16m1(a_.sv64 , c , b_.sv64 , 4); + return simde_int16x4_from_private(r_); #else return simde_vmls_s16(a, b, simde_vdup_n_s16(c)); #endif @@ -68,6 +83,13 @@ simde_int32x2_t simde_vmls_n_s32(simde_int32x2_t a, simde_int32x2_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + r_.sv64 = __riscv_vnmsac_vx_i32m1(a_.sv64 , c , b_.sv64 , 2); + return simde_int32x2_from_private(r_); #else return simde_vmls_s32(a, b, simde_vdup_n_s32(c)); #endif @@ -82,6 +104,13 @@ simde_uint16x4_t simde_vmls_n_u16(simde_uint16x4_t a, simde_uint16x4_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b); + r_.sv64 = __riscv_vnmsac_vx_u16m1(a_.sv64 , c , b_.sv64 , 4); + return simde_uint16x4_from_private(r_); #else return simde_vmls_u16(a, b, simde_vdup_n_u16(c)); #endif @@ -96,6 +125,13 @@ simde_uint32x2_t simde_vmls_n_u32(simde_uint32x2_t a, simde_uint32x2_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmls_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b); + r_.sv64 = __riscv_vnmsac_vx_u32m1(a_.sv64 , c , b_.sv64 , 2); + return simde_uint32x2_from_private(r_); #else return simde_vmls_u32(a, b, simde_vdup_n_u32(c)); #endif @@ -110,6 +146,13 @@ simde_float32x4_t simde_vmlsq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32 c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_f32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + r_.sv128 = __riscv_vfnmsac_vf_f32m1(a_.sv128 , c , b_.sv128 , 4); + return simde_float32x4_from_private(r_); #else return simde_vmlsq_f32(a, b, simde_vdupq_n_f32(c)); #endif @@ -124,6 +167,13 @@ simde_int16x8_t simde_vmlsq_n_s16(simde_int16x8_t a, simde_int16x8_t b, int16_t c) { #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + r_.sv128 = __riscv_vnmsac_vx_i16m1(a_.sv128 , c , b_.sv128 , 8); + return simde_int16x8_from_private(r_); #else return simde_vmlsq_s16(a, b, simde_vdupq_n_s16(c)); #endif @@ -138,6 +188,13 @@ simde_int32x4_t simde_vmlsq_n_s32(simde_int32x4_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + r_.sv128 = __riscv_vnmsac_vx_i32m1(a_.sv128 , c , b_.sv128 , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlsq_s32(a, b, simde_vdupq_n_s32(c)); #endif @@ -152,6 +209,13 @@ simde_uint16x8_t simde_vmlsq_n_u16(simde_uint16x8_t a, simde_uint16x8_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + r_.sv128 = __riscv_vnmsac_vx_u16m1(a_.sv128 , c , b_.sv128 , 8); + return simde_uint16x8_from_private(r_); #else return simde_vmlsq_u16(a, b, simde_vdupq_n_u16(c)); #endif @@ -166,6 +230,13 @@ simde_uint32x4_t simde_vmlsq_n_u32(simde_uint32x4_t a, simde_uint32x4_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + r_.sv128 = __riscv_vnmsac_vx_u32m1(a_.sv128 , c , b_.sv128 , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlsq_u32(a, b, simde_vdupq_n_u32(c)); #endif diff --git a/simde/arm/neon/mlsl.h b/simde/arm/neon/mlsl.h index e79cea157..6dae3de71 100644 --- a/simde/arm/neon/mlsl.h +++ b/simde/arm/neon/mlsl.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLSL_H) @@ -39,6 +40,15 @@ simde_int16x8_t simde_vmlsl_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + simde_int8x8_private c_ = simde_int8x8_to_private(c); + vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv64); + vint8mf2_t vc = __riscv_vlmul_trunc_v_i8m1_i8mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_i16m1(a_.sv128 , __riscv_vwmul_vv_i16m1(vb , vc , 8) , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(a, simde_vmull_s8(b, c)); #endif @@ -53,6 +63,15 @@ simde_int32x4_t simde_vmlsl_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + simde_int16x4_private c_ = simde_int16x4_to_private(c); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + vint16mf2_t vc = __riscv_vlmul_trunc_v_i16m1_i16mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_i32m1(a_.sv128 , 
__riscv_vwmul_vv_i32m1(vb , vc , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(a, simde_vmull_s16(b, c)); #endif @@ -67,6 +86,15 @@ simde_int64x2_t simde_vmlsl_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + simde_int32x2_private c_ = simde_int32x2_to_private(c); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + vint32mf2_t vc = __riscv_vlmul_trunc_v_i32m1_i32mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_i64m1(a_.sv128 , __riscv_vwmul_vv_i64m1(vb , vc , 2) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(a, simde_vmull_s32(b, c)); #endif @@ -81,6 +109,15 @@ simde_uint16x8_t simde_vmlsl_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + simde_uint8x8_private c_ = simde_uint8x8_to_private(c); + vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64); + vuint8mf2_t vc = __riscv_vlmul_trunc_v_u8m1_u8mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_u16m1(a_.sv128 , __riscv_vwmulu_vv_u16m1(vb , vc , 8) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(a, simde_vmull_u8(b, c)); #endif @@ -95,6 +132,15 @@ simde_uint32x4_t simde_vmlsl_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + simde_uint16x4_private c_ = simde_uint16x4_to_private(c); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + vuint16mf2_t vc = __riscv_vlmul_trunc_v_u16m1_u16mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_u32m1(a_.sv128 , __riscv_vwmulu_vv_u32m1(vb , vc , 4) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(a, simde_vmull_u16(b, c)); #endif @@ -109,6 +155,15 @@ simde_uint64x2_t simde_vmlsl_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + simde_uint32x2_private c_ = simde_uint32x2_to_private(c); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + vuint32mf2_t vc = __riscv_vlmul_trunc_v_u32m1_u32mf2 (c_.sv64); + r_.sv128 = __riscv_vsub_vv_u64m1(a_.sv128 , __riscv_vwmulu_vv_u64m1(vb , vc , 2) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(a, simde_vmull_u32(b, c)); #endif diff --git a/simde/arm/neon/mlsl_high.h b/simde/arm/neon/mlsl_high.h index d70ca935d..4477064e5 100644 --- a/simde/arm/neon/mlsl_high.h +++ b/simde/arm/neon/mlsl_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by 
NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLSL_HIGH_H) @@ -39,6 +40,17 @@ simde_int16x8_t simde_vmlsl_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + simde_int8x16_private c_ = simde_int8x16_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + c_.sv128 = __riscv_vslidedown_vx_i8m1(c_.sv128 , 8 , 16); + vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv128); + vint8mf2_t vc = __riscv_vlmul_trunc_v_i8m1_i8mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_i16m1(a_.sv128 , __riscv_vwmul_vv_i16m1(vb , vc , 8) , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(a, simde_vmull_high_s8(b, c)); #endif @@ -53,6 +65,17 @@ simde_int32x4_t simde_vmlsl_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + simde_int16x8_private c_ = simde_int16x8_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + c_.sv128 = __riscv_vslidedown_vx_i16m1(c_.sv128 , 4 , 8); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv128); + vint16mf2_t vc = __riscv_vlmul_trunc_v_i16m1_i16mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_i32m1(a_.sv128 , __riscv_vwmul_vv_i32m1(vb , vc , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(a, simde_vmull_high_s16(b, c)); #endif @@ -67,6 +90,17 @@ simde_int64x2_t simde_vmlsl_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + simde_int32x4_private c_ = simde_int32x4_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + c_.sv128 = __riscv_vslidedown_vx_i32m1(c_.sv128 , 2, 4); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv128); + vint32mf2_t vc = __riscv_vlmul_trunc_v_i32m1_i32mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_i64m1(a_.sv128 , __riscv_vwmul_vv_i64m1(vb , vc , 2) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(a, simde_vmull_high_s32(b, c)); #endif @@ -81,6 +115,17 @@ simde_uint16x8_t simde_vmlsl_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + simde_uint8x16_private c_ = simde_uint8x16_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + c_.sv128 = __riscv_vslidedown_vx_u8m1(c_.sv128 , 8 , 16); + vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128); + vuint8mf2_t vc = __riscv_vlmul_trunc_v_u8m1_u8mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_u16m1(a_.sv128 , __riscv_vwmulu_vv_u16m1(vb , vc , 8) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(a, simde_vmull_high_u8(b, c)); 
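/* Illustrative sketch (not part of the patch): each mlsl_high_* RVV path above
 * follows the same three steps -- vslidedown moves the high half of the two
 * 128-bit sources into lanes 0..n-1, vlmul_trunc narrows the register group to
 * LMUL=1/2, and a widening multiply feeds an ordinary subtract at the
 * destination width. A standalone version of the pattern for the int8 -> int16
 * case, assuming <riscv_vector.h> and the fixed 128-bit vector size SIMDe
 * configures (the helper name is invented for the sketch): */
static inline vint16m1_t
sketch_mlsl_high_s8(vint16m1_t acc, vint8m1_t b, vint8m1_t c) {
  /* bring lanes 8..15 of b and c down to lanes 0..7 */
  vint8m1_t b_hi = __riscv_vslidedown_vx_i8m1(b, 8, 16);
  vint8m1_t c_hi = __riscv_vslidedown_vx_i8m1(c, 8, 16);
  /* LMUL=1/2 inputs make the widening multiply return an LMUL=1 int16 vector */
  vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2(b_hi);
  vint8mf2_t vc = __riscv_vlmul_trunc_v_i8m1_i8mf2(c_hi);
  /* acc[i] - (int16)b_hi[i] * (int16)c_hi[i] for the eight result lanes */
  return __riscv_vsub_vv_i16m1(acc, __riscv_vwmul_vv_i16m1(vb, vc, 8), 8);
}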
#endif @@ -95,6 +140,17 @@ simde_uint32x4_t simde_vmlsl_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + simde_uint16x8_private c_ = simde_uint16x8_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + c_.sv128 = __riscv_vslidedown_vx_u16m1(c_.sv128 , 4 , 8); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128); + vuint16mf2_t vc = __riscv_vlmul_trunc_v_u16m1_u16mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_u32m1(a_.sv128 , __riscv_vwmulu_vv_u32m1(vb , vc , 4) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(a, simde_vmull_high_u16(b, c)); #endif @@ -109,6 +165,17 @@ simde_uint64x2_t simde_vmlsl_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + simde_uint32x4_private c_ = simde_uint32x4_to_private(c); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + c_.sv128 = __riscv_vslidedown_vx_u32m1(c_.sv128 , 2, 4); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128); + vuint32mf2_t vc = __riscv_vlmul_trunc_v_u32m1_u32mf2 (c_.sv128); + r_.sv128 = __riscv_vsub_vv_u64m1(a_.sv128 , __riscv_vwmulu_vv_u64m1(vb , vc , 2) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(a, simde_vmull_high_u32(b, c)); #endif diff --git a/simde/arm/neon/mlsl_high_n.h b/simde/arm/neon/mlsl_high_n.h index 7be34c81b..be23c0079 100644 --- a/simde/arm/neon/mlsl_high_n.h +++ b/simde/arm/neon/mlsl_high_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Décio Luiz Gazzoni Filho + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLSL_HIGH_N_H) @@ -41,6 +42,14 @@ simde_int32x4_t simde_vmlsl_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv128); + r_.sv128 = __riscv_vsub_vv_i32m1(a_.sv128 , __riscv_vwmul_vx_i32m1(vb , c , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vmlsq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c)); #endif @@ -55,6 +64,14 @@ simde_int64x2_t simde_vmlsl_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv128); + r_.sv128 = __riscv_vsub_vv_i64m1(a_.sv128 , __riscv_vwmul_vx_i64m1(vb , c , 2) , 2); + return simde_int64x2_from_private(r_); #else simde_int64x2_private r_, @@ -84,6 +101,14 @@ simde_uint32x4_t 
simde_vmlsl_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128); + r_.sv128 = __riscv_vsub_vv_u32m1(a_.sv128 , __riscv_vwmulu_vx_u32m1(vb , c , 4) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vmlsq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c)); #endif @@ -98,6 +123,14 @@ simde_uint64x2_t simde_vmlsl_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsl_high_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128); + r_.sv128 = __riscv_vsub_vv_u64m1(a_.sv128 , __riscv_vwmulu_vx_u64m1(vb , c , 2) , 2); + return simde_uint64x2_from_private(r_); #else simde_uint64x2_private r_, diff --git a/simde/arm/neon/mlsl_n.h b/simde/arm/neon/mlsl_n.h index 68ee44bff..1ec4a36ce 100644 --- a/simde/arm/neon/mlsl_n.h +++ b/simde/arm/neon/mlsl_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MLSL_N_H) @@ -39,6 +40,13 @@ simde_int32x4_t simde_vmlsl_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_n_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + r_.sv128 = __riscv_vsub_vv_i32m1(a_.sv128 , __riscv_vwmul_vx_i32m1(vb , c , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(a, simde_vmull_n_s16(b, c)); #endif @@ -53,6 +61,13 @@ simde_int64x2_t simde_vmlsl_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_n_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + r_.sv128 = __riscv_vsub_vv_i64m1(a_.sv128 , __riscv_vwmul_vx_i64m1(vb , c , 2) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(a, simde_vmull_n_s32(b, c)); #endif @@ -67,6 +82,13 @@ simde_uint32x4_t simde_vmlsl_n_u16(simde_uint32x4_t a, simde_uint16x4_t b, uint16_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_n_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + r_.sv128 = __riscv_vsub_vv_u32m1(a_.sv128 , __riscv_vwmulu_vx_u32m1(vb , c , 4) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(a, simde_vmull_n_u16(b, c)); #endif @@ -81,6 +103,13 @@ simde_uint64x2_t 
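/* Sketch of the scalar-operand ("vx") variant used by the mlsl_n and
 * mlsl_high_n paths above (not part of the patch): the RVV widening multiplies
 * accept the scalar c directly, so the vdupq splat of the portable fallback is
 * unnecessary. Assuming <riscv_vector.h> and 128-bit vectors; the helper name
 * is invented for illustration: */
static inline vint32m1_t
sketch_mlsl_n_s16(vint32m1_t acc, vint16m1_t b, int16_t c) {
  /* only the low four int16 lanes of b are used, at LMUL=1/2 */
  vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2(b);
  /* acc[i] - (int32)b[i] * (int32)c for i = 0..3 */
  return __riscv_vsub_vv_i32m1(acc, __riscv_vwmul_vx_i32m1(vb, c, 4), 4);
}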
simde_vmlsl_n_u32(simde_uint64x2_t a, simde_uint32x2_t b, uint32_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsl_n_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + r_.sv128 = __riscv_vsub_vv_u64m1(a_.sv128 , __riscv_vwmulu_vx_u64m1(vb , c , 2) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(a, simde_vmull_n_u32(b, c)); #endif diff --git a/simde/arm/neon/qsub.h b/simde/arm/neon/qsub.h index 0c3e375c1..87213b43e 100644 --- a/simde/arm/neon/qsub.h +++ b/simde/arm/neon/qsub.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_QSUB_H) @@ -134,6 +135,8 @@ simde_vqsub_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_subs_pi8(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssub_vv_i8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT8_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -168,6 +171,8 @@ simde_vqsub_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_subs_pi16(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssub_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT16_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -200,7 +205,9 @@ simde_vqsub_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssub_vv_i32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT32_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; const __typeof__(r_.values) saturate = diff_sat ^ diff; @@ -232,7 +239,9 @@ simde_vqsub_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssub_vv_i64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT64_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; const __typeof__(r_.values) saturate = diff_sat ^ diff; @@ -266,6 +275,8 @@ simde_vqsub_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_subs_pu8(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssubu_vv_u8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); @@ -297,6 +308,8 @@ simde_vqsub_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_subs_pu16(a_.m64, b_.m64); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssubu_vv_u16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); @@ -326,7 +339,9 @@ simde_vqsub_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssubu_vv_u32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); #else @@ -355,7 +370,9 @@ simde_vqsub_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vssubu_vv_u64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); #else @@ -390,6 +407,8 @@ simde_vqsubq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.v128 = wasm_i8x16_sub_sat(a_.v128, b_.v128); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_subs_epi8(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssub_vv_i8m1(a_.sv128 , b_.sv128 , 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT8_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -428,6 +447,8 @@ simde_vqsubq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.v128 = wasm_i16x8_sub_sat(a_.v128, b_.v128); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_subs_epi16(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssub_vv_i16m1(a_.sv128 , b_.sv128 , 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT16_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -479,6 +500,8 @@ simde_vqsubq_s32(simde_int32x4_t a, simde_int32x4_t b) { #else r_.m128i = _mm_xor_si128(diff, _mm_and_si128(t, _mm_srai_epi32(t, 31))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssub_vv_i32m1(a_.sv128 , b_.sv128 , 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT32_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; @@ -511,7 +534,9 @@ simde_vqsubq_s64(simde_int64x2_t a, simde_int64x2_t b) { a_ = simde_int64x2_to_private(a), b_ = simde_int64x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssub_vv_i64m1(a_.sv128 , b_.sv128 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) const __typeof__(r_.values) diff_sat = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (b_.values > a_.values) ^ INT64_MAX); const __typeof__(r_.values) diff = a_.values - b_.values; const __typeof__(r_.values) saturate = diff_sat ^ diff; @@ -549,6 +574,8 @@ simde_vqsubq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.v128 = wasm_u8x16_sub_sat(a_.v128, b_.v128); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_subs_epu8(a_.m128i, b_.m128i); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssubu_vv_u8m1(a_.sv128 , b_.sv128 , 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values <= a_.values); @@ -584,6 +611,8 @@ simde_vqsubq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.v128 = wasm_u16x8_sub_sat(a_.v128, b_.v128); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_subs_epu16(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssubu_vv_u16m1(a_.sv128 , b_.sv128 , 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values <= a_.values); @@ -629,6 +658,8 @@ simde_vqsubq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { _mm_set1_epi32(~INT32_C(0)) ) ); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssubu_vv_u32m1(a_.sv128 , b_.sv128 , 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); @@ -661,7 +692,9 @@ simde_vqsubq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vssubu_vv_u64m1(a_.sv128 , b_.sv128 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values - b_.values; r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (r_.values <= a_.values)); #else diff --git a/simde/arm/neon/qtbl.h b/simde/arm/neon/qtbl.h index f1897d77b..066278a93 100644 --- a/simde/arm/neon/qtbl.h +++ b/simde/arm/neon/qtbl.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_QTBL_H) @@ -55,6 +56,10 @@ simde_vqtbl1_u8(simde_uint8x16_t t, simde_uint8x8_t idx) { __m128i idx128 = _mm_set1_epi64(idx_.m64); __m128i r128 = _mm_shuffle_epi8(t_.m128i, _mm_or_si128(idx128, _mm_cmpgt_epi8(idx128, _mm_set1_epi8(15)))); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (idx_.sv64, 16, 8); + r_.sv64 = __riscv_vrgather_vv_u8m1(t_.sv128 , idx_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, 0, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -108,6 +113,14 @@ simde_vqtbl2_u8(simde_uint8x16x2_t t, simde_uint8x8_t idx) { __m128i r128_1 = _mm_shuffle_epi8(t_[1].m128i, idx128); __m128i r128 = _mm_blendv_epi8(r128_0, r128_1, _mm_slli_epi32(idx128, 3)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[0].sv128); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[1].sv128); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t1 , t2 , 16 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(idx_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vxm_u8m2(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -169,6 +182,16 @@ simde_vqtbl3_u8(simde_uint8x16x3_t t, simde_uint8x8_t idx) { __m128i r128_2 = _mm_shuffle_epi8(t_[2].m128i, idx128); __m128i r128 = 
_mm_blendv_epi8(r128_01, r128_2, _mm_slli_epi32(idx128, 2)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t2 , t3 , 16 , 48); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 48); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv64); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 48, 8); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vxm_u8m4(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -233,6 +256,18 @@ simde_vqtbl4_u8(simde_uint8x16x4_t t, simde_uint8x8_t idx) { __m128i r128_23 = _mm_blendv_epi8(r128_2, r128_3, idx128_shl3); __m128i r128 = _mm_blendv_epi8(r128_01, r128_23, _mm_slli_epi32(idx128, 2)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t4 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[3].sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t3 , t4 , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t2 , t_combine , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 64); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv64); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 64, 8); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vxm_u8m4(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -289,6 +324,10 @@ simde_vqtbl1q_u8(simde_uint8x16_t t, simde_uint8x16_t idx) { r_.m128i = _mm_shuffle_epi8(t_.m128i, _mm_or_si128(idx_.m128i, _mm_cmpgt_epi8(idx_.m128i, _mm_set1_epi8(15)))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_swizzle(t_.v128, idx_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (idx_.sv128, 16, 16); + r_.sv128 = __riscv_vrgather_vv_u8m1(t_.sv128 , idx_.sv128 , 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, 0, mask, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -348,6 +387,14 @@ simde_vqtbl2q_u8(simde_uint8x16x2_t t, simde_uint8x16_t idx) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_or(wasm_i8x16_swizzle(t_[0].v128, idx_.v128), wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[0].sv128); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[1].sv128); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t1 , t2 , 16 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(idx_.sv128); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 16); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vxm_u8m2(r_tmp, 0, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -418,6 +465,16 @@ simde_vqtbl3q_u8(simde_uint8x16x3_t t, simde_uint8x16_t 
idx) { r_.v128 = wasm_v128_or(wasm_v128_or(wasm_i8x16_swizzle(t_[0].v128, idx_.v128), wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))), wasm_i8x16_swizzle(t_[2].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(32)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t2 , t3 , 16 , 48); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 48); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv128); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 48, 16); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vxm_u8m4(r_tmp, 0, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -492,6 +549,18 @@ simde_vqtbl4q_u8(simde_uint8x16x4_t t, simde_uint8x16_t idx) { wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))), wasm_v128_or(wasm_i8x16_swizzle(t_[2].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(32))), wasm_i8x16_swizzle(t_[3].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(48))))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t4 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[3].sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t3 , t4 , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t2 , t_combine , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 64); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv128); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 64, 16); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vxm_u8m4(r_tmp, 0, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/simde/arm/neon/qtbx.h b/simde/arm/neon/qtbx.h index 221c4b5df..326bf9f21 100644 --- a/simde/arm/neon/qtbx.h +++ b/simde/arm/neon/qtbx.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_QTBX_H) @@ -58,6 +59,10 @@ simde_vqtbx1_u8(simde_uint8x8_t a, simde_uint8x16_t t, simde_uint8x8_t idx) { __m128i r128 = _mm_shuffle_epi8(t_.m128i, idx128); r128 = _mm_blendv_epi8(r128, _mm_set1_epi64(a_.m64), idx128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (idx_.sv64, 16, 8); + r_.sv64 = __riscv_vrgather_vv_u8m1(t_.sv128 , idx_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vvm_u8m1(r_.sv64, a_.sv64, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -113,6 +118,15 @@ simde_vqtbx2_u8(simde_uint8x8_t a, simde_uint8x16x2_t t, simde_uint8x8_t idx) { __m128i r128 = _mm_blendv_epi8(r128_0, r128_1, _mm_slli_epi32(idx128, 3)); r128 = _mm_blendv_epi8(r128, _mm_set1_epi64(a_.m64), idx128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[0].sv128); + vuint8m2_t t2 = 
__riscv_vlmul_ext_v_u8m1_u8m2 (t_[1].sv128); + vuint8m2_t am2 = __riscv_vlmul_ext_v_u8m1_u8m2(a_.sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t1 , t2 , 16 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(idx_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vvm_u8m2(r_tmp, am2, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -174,6 +188,17 @@ simde_vqtbx3_u8(simde_uint8x8_t a, simde_uint8x16x3_t t, simde_uint8x8_t idx) { __m128i r128 = _mm_blendv_epi8(r128_01, r128_2, _mm_slli_epi32(idx128, 2)); r128 = _mm_blendv_epi8(r128, _mm_set1_epi64(a_.m64), idx128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t am4 = __riscv_vlmul_ext_v_u8m1_u8m4 (a_.sv64); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t2 , t3 , 16 , 48); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 48); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv64); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 48, 8); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vvm_u8m4(r_tmp, am4, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -238,6 +263,19 @@ simde_vqtbx4_u8(simde_uint8x8_t a, simde_uint8x16x4_t t, simde_uint8x8_t idx) { __m128i r128 = _mm_blendv_epi8(r128_01, r128_23, _mm_slli_epi32(idx128, 2)); r128 = _mm_blendv_epi8(r128, _mm_set1_epi64(a_.m64), idx128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t4 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[3].sv128); + vuint8m4_t am4 = __riscv_vlmul_ext_v_u8m1_u8m4 (a_.sv64); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t3 , t4 , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t2 , t_combine , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 64); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv64); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 64, 8); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vvm_u8m4(r_tmp, am4, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -299,6 +337,10 @@ simde_vqtbx1q_u8(simde_uint8x16_t a, simde_uint8x16_t t, simde_uint8x16_t idx) { #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_or(wasm_i8x16_swizzle(t_.v128, idx_.v128), wasm_v128_and(a_.v128, wasm_u8x16_gt(idx_.v128, wasm_i8x16_splat(15)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (idx_.sv128, 16, 16); + r_.sv128 = __riscv_vrgather_vv_u8m1(t_.sv128 , idx_.sv128 , 16); + r_.sv128 = __riscv_vmerge_vvm_u8m1(r_.sv128, a_.sv128, mask, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -361,6 +403,15 @@ simde_vqtbx2q_u8(simde_uint8x16_t a, simde_uint8x16x2_t t, simde_uint8x16_t 
idx) r_.v128 = wasm_v128_or(wasm_v128_or(wasm_i8x16_swizzle(t_[0].v128, idx_.v128), wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))), wasm_v128_and(a_.v128, wasm_u8x16_gt(idx_.v128, wasm_i8x16_splat(31)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[0].sv128); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (t_[1].sv128); + vuint8m2_t am2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_.sv128); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t1 , t2 , 16 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(idx_.sv128); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 16); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vvm_u8m2(r_tmp, am2, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -434,6 +485,17 @@ simde_vqtbx3q_u8(simde_uint8x16_t a, simde_uint8x16x3_t t, simde_uint8x16_t idx) wasm_i8x16_swizzle(t_[1].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(16)))), wasm_v128_or(wasm_i8x16_swizzle(t_[2].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(32))) , wasm_v128_and(a_.v128, wasm_u8x16_gt(idx_.v128, wasm_i8x16_splat(47))))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t am4 = __riscv_vlmul_ext_v_u8m1_u8m4 (a_.sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t2 , t3 , 16 , 48); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 48); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv128); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 48, 16); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vvm_u8m4(r_tmp, am4, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -511,6 +573,19 @@ simde_vqtbx4q_u8(simde_uint8x16_t a, simde_uint8x16x4_t t, simde_uint8x16_t idx) wasm_v128_or(wasm_i8x16_swizzle(t_[2].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(32))), wasm_i8x16_swizzle(t_[3].v128, wasm_i8x16_sub(idx_.v128, wasm_i8x16_splat(48))))), wasm_v128_and(a_.v128, wasm_u8x16_gt(idx_.v128, wasm_i8x16_splat(63)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m4_t t1 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[0].sv128); + vuint8m4_t t2 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[1].sv128); + vuint8m4_t t3 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[2].sv128); + vuint8m4_t t4 = __riscv_vlmul_ext_v_u8m1_u8m4 (t_[3].sv128); + vuint8m4_t am4 = __riscv_vlmul_ext_v_u8m1_u8m4 (a_.sv128); + vuint8m4_t t_combine = __riscv_vslideup_vx_u8m4(t3 , t4 , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t2 , t_combine , 16 , 64); + t_combine = __riscv_vslideup_vx_u8m4(t1 , t_combine , 16 , 64); + vuint8m4_t idxm4 = __riscv_vlmul_ext_v_u8m1_u8m4(idx_.sv128); + vbool2_t mask = __riscv_vmsgeu_vx_u8m4_b2 (idxm4, 64, 16); + vuint8m4_t r_tmp = __riscv_vrgather_vv_u8m4(t_combine , idxm4 , 16); + r_.sv128 = __riscv_vlmul_trunc_v_u8m4_u8m1(__riscv_vmerge_vvm_u8m4(r_tmp, am4, mask, 16)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/simde/arm/neon/rbit.h b/simde/arm/neon/rbit.h index 647c4c5ae..ce63117c1 100644 --- a/simde/arm/neon/rbit.h +++ b/simde/arm/neon/rbit.h @@ -24,6 +24,7 @@ * 2020 
Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ /* The GFNI implementation is based on Wojciech Muła's work at @@ -62,6 +63,13 @@ simde_vrbit_u8(simde_uint8x8_t a) { a_.m64 = _mm_or_si64(_mm_andnot_si64(mask, _mm_slli_pi16(a_.m64, 2)), _mm_and_si64(mask, _mm_srli_pi16(a_.m64, 2))); mask = _mm_set1_pi8(0x0F); r_.m64 = _mm_or_si64(_mm_andnot_si64(mask, _mm_slli_pi16(a_.m64, 4)), _mm_and_si64(mask, _mm_srli_pi16(a_.m64, 4))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t mask; + mask = __riscv_vmv_v_x_u8m1(0x55 , 8); + a_.sv64 = __riscv_vor_vv_u8m1(__riscv_vand_vv_u8m1(mask , __riscv_vsrl_vx_u8m1(a_.sv64 , 1 , 8) , 8) , __riscv_vsll_vx_u8m1(__riscv_vand_vv_u8m1(mask , a_.sv64 , 8) , 1 , 8) , 8); + mask = __riscv_vmv_v_x_u8m1(0x33 , 8); + a_.sv64 = __riscv_vor_vv_u8m1(__riscv_vand_vv_u8m1(mask , __riscv_vsrl_vx_u8m1(a_.sv64 , 2 , 8) , 8) , __riscv_vsll_vx_u8m1(__riscv_vand_vv_u8m1(mask , a_.sv64 , 8) , 2 , 8) , 8); + r_.sv64 = __riscv_vor_vv_u8m1(__riscv_vsrl_vx_u8m1(a_.sv64 , 4 , 8) , __riscv_vsll_vx_u8m1(a_.sv64 , 4 , 8) , 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -127,6 +135,13 @@ simde_vrbitq_u8(simde_uint8x16_t a) { a_.v128 = wasm_v128_bitselect(wasm_u8x16_shr(a_.v128, 1), wasm_i8x16_shl(a_.v128, 1), wasm_i8x16_splat(0x55)); a_.v128 = wasm_v128_bitselect(wasm_u8x16_shr(a_.v128, 2), wasm_i8x16_shl(a_.v128, 2), wasm_i8x16_splat(0x33)); r_.v128 = wasm_v128_or(wasm_u8x16_shr(a_.v128, 4), wasm_i8x16_shl(a_.v128, 4)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t mask; + mask = __riscv_vmv_v_x_u8m1(0x55 , 16); + a_.sv128 = __riscv_vor_vv_u8m1(__riscv_vand_vv_u8m1(mask , __riscv_vsrl_vx_u8m1(a_.sv128 , 1 , 16) , 16) , __riscv_vsll_vx_u8m1(__riscv_vand_vv_u8m1(mask , a_.sv128 , 16) , 1 , 16) , 16); + mask = __riscv_vmv_v_x_u8m1(0x33 , 16); + a_.sv128 = __riscv_vor_vv_u8m1(__riscv_vand_vv_u8m1(mask , __riscv_vsrl_vx_u8m1(a_.sv128 , 2 , 16) , 16) , __riscv_vsll_vx_u8m1(__riscv_vand_vv_u8m1(mask , a_.sv128 , 16) , 2 , 16) , 16); + r_.sv128 = __riscv_vor_vv_u8m1(__riscv_vsrl_vx_u8m1(a_.sv128 , 4 , 16) , __riscv_vsll_vx_u8m1(a_.sv128 , 4 , 16) , 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/simde/arm/neon/recpe.h b/simde/arm/neon/recpe.h index 382d12fc4..be068a06c 100644 --- a/simde/arm/neon/recpe.h +++ b/simde/arm/neon/recpe.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_RECPE_H) @@ -90,10 +91,14 @@ simde_vrecpe_f16(simde_float16x4_t a) { r_, a_ = simde_float16x4_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vrecpeh_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfrec7_v_f16m1(a_.sv64 , 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpeh_f16(a_.values[i]); + } + #endif return simde_float16x4_from_private(r_); #endif @@ -113,7 +118,9 @@ simde_vrecpe_f32(simde_float32x2_t a) { r_, a_ = simde_float32x2_to_private(a); - #if defined(SIMDE_IEEE754_STORAGE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfrec7_v_f32m1(a_.sv64 , 2); + 
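/* Note on the reciprocal mapping (sketch, not part of the patch): vfrec7 is
 * specified to about 7 bits of precision, roughly in line with what NEON's
 * vrecpe delivers, so it is a reasonable drop-in for the estimate itself.
 * Callers that need a tighter result typically refine it with Newton-Raphson
 * steps, x1 = x0 * (2 - a*x0), which is the job of vrecps on the NEON side.
 * One refinement step written with plain RVV intrinsics, assuming
 * <riscv_vector.h> (the helper name is invented for the sketch): */
static inline vfloat32m1_t
sketch_recpe_refine_f32(vfloat32m1_t a, size_t vl) {
  vfloat32m1_t x = __riscv_vfrec7_v_f32m1(a, vl);   /* ~7-bit estimate of 1/a */
  vfloat32m1_t t = __riscv_vfmv_v_f_f32m1(2.0f, vl);
  t = __riscv_vfnmsac_vv_f32m1(t, a, x, vl);        /* t = 2.0 - a*x */
  return __riscv_vfmul_vv_f32m1(x, t, vl);          /* x * (2 - a*x) */
}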
#elif defined(SIMDE_IEEE754_STORAGE) /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */ SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -152,7 +159,9 @@ simde_vrecpe_f64(simde_float64x1_t a) { r_, a_ = simde_float64x1_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfrec7_v_f64m1(a_.sv64 , 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = 1.0 / a_.values; #else SIMDE_VECTORIZE @@ -179,7 +188,9 @@ simde_vrecpeq_f64(simde_float64x2_t a) { r_, a_ = simde_float64x2_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfrec7_v_f64m1(a_.sv128 , 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = 1.0 / a_.values; #else SIMDE_VECTORIZE @@ -208,8 +219,11 @@ simde_vrecpeq_f32(simde_float32x4_t a) { r_, a_ = simde_float32x4_to_private(a); + #if defined(SIMDE_X86_SSE_NATIVE) r_.m128 = _mm_rcp_ps(a_.m128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfrec7_v_f32m1(a_.sv128 , 4); #elif defined(SIMDE_IEEE754_STORAGE) /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */ SIMDE_VECTORIZE @@ -249,10 +263,14 @@ simde_vrecpeq_f16(simde_float16x8_t a) { r_, a_ = simde_float16x8_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vrecpeh_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfrec7_v_f16m1(a_.sv128 , 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpeh_f16(a_.values[i]); + } + #endif return simde_float16x8_from_private(r_); #endif diff --git a/simde/arm/neon/rev16.h b/simde/arm/neon/rev16.h index 5ad0bffd1..3cbd3df71 100644 --- a/simde/arm/neon/rev16.h +++ b/simde/arm/neon/rev16.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_REV16_H) @@ -48,6 +49,9 @@ simde_vrev16_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi8(a_.m64, _mm_set_pi8(6, 7, 4, 5, 2, 3, 0, 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {1, 0, 3, 2, 5, 4, 7, 6}; + r_.sv64 = __riscv_vrgather_vv_i8m1(a_.sv64, __riscv_vle8_v_u8m1(shuffle_idx, 8), 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.values, a_.values, 1, 0, 3, 2, 5, 4, 7, 6); #else @@ -99,6 +103,9 @@ simde_vrev16q_s8(simde_int8x16_t a) { r_.m128i = _mm_shuffle_epi8(a_.m128i, _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + r_.sv128 = __riscv_vrgather_vv_i8m1(a_.sv128, __riscv_vle8_v_u8m1(shuffle_idx, 16), 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); #else diff --git 
a/simde/arm/neon/rev32.h b/simde/arm/neon/rev32.h index 172c38cd4..e3dff42cc 100644 --- a/simde/arm/neon/rev32.h +++ b/simde/arm/neon/rev32.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_REV32_H) @@ -48,6 +49,9 @@ simde_vrev32_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi8(a_.m64, _mm_set_pi8(4, 5, 6, 7, 0, 1, 2, 3)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {3, 2, 1, 0, 7, 6, 5, 4}; + r_.sv64 = __riscv_vrgather_vv_i8m1(a_.sv64, __riscv_vle8_v_u8m1(shuffle_idx, 8), 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.values, a_.values, 3, 2, 1, 0, 7, 6, 5, 4); #else @@ -77,6 +81,9 @@ simde_vrev32_s16(simde_int16x4_t a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi16(a_.m64, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint16_t shuffle_idx[] = {1, 0, 3, 2}; + r_.sv64 = __riscv_vrgather_vv_i16m1(a_.sv64, __riscv_vle16_v_u16m1(shuffle_idx, 4), 4); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.values, a_.values, 1, 0, 3, 2); #else @@ -143,6 +150,9 @@ simde_vrev32q_s8(simde_int8x16_t a) { 4, 5, 6, 7, 0, 1, 2, 3)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}; + r_.sv128 = __riscv_vrgather_vv_i8m1(a_.sv128, __riscv_vle8_v_u8m1(shuffle_idx, 16), 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); #else @@ -173,7 +183,10 @@ simde_vrev32q_s16(simde_int16x8_t a) { r_, a_ = simde_int16x8_to_private(a); - #if defined(SIMDE_X86_SSSE3_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + uint16_t shuffle_idx[] = {1, 0, 3, 2, 5, 4, 7, 6}; + r_.sv128 = __riscv_vrgather_vv_i16m1(a_.sv128, __riscv_vle16_v_u16m1(shuffle_idx, 8), 8); + #elif defined(SIMDE_X86_SSSE3_NATIVE) r_.m128i = _mm_shuffle_epi8(a_.m128i, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); #elif defined(SIMDE_X86_SSE2_NATIVE) diff --git a/simde/arm/neon/rev64.h b/simde/arm/neon/rev64.h index 4e3af9c51..565fd5902 100644 --- a/simde/arm/neon/rev64.h +++ b/simde/arm/neon/rev64.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_REV64_H) @@ -48,6 +49,9 @@ simde_vrev64_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi8(a_.m64, _mm_set_pi8(0, 1, 2, 3, 4, 5, 6, 7)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {7, 6, 5, 4, 3, 2, 1, 0}; + r_.sv64 = __riscv_vrgather_vv_i8m1(a_.sv64, __riscv_vle8_v_u8m1(shuffle_idx, 8), 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.values, a_.values, 7, 6, 5, 4, 3, 2, 1, 0); #else @@ -77,6 +81,9 @@ simde_vrev64_s16(simde_int16x4_t a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi16(a_.m64, (0 << 6) | (1 
<< 4) | (2 << 2) | (3 << 0)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint16_t shuffle_idx[] = {3, 2, 1, 0}; + r_.sv64 = __riscv_vrgather_vv_i16m1(a_.sv64, __riscv_vle16_v_u16m1(shuffle_idx, 4), 4); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.values, a_.values, 3, 2, 1, 0); #else @@ -106,6 +113,9 @@ simde_vrev64_s32(simde_int32x2_t a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi16(a_.m64, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint32_t shuffle_idx[] = {1, 0}; + r_.sv64 = __riscv_vrgather_vv_i32m1(a_.sv64, __riscv_vle32_v_u32m1(shuffle_idx, 2), 2); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) r_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 0); #else @@ -214,6 +224,9 @@ simde_vrev64q_s8(simde_int8x16_t a) { 0, 1, 2, 3, 4, 5, 6, 7)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint8_t shuffle_idx[] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; + r_.sv128 = __riscv_vrgather_vv_i8m1(a_.sv128, __riscv_vle8_v_u8m1(shuffle_idx, 16), 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); #else @@ -253,6 +266,9 @@ simde_vrev64q_s16(simde_int16x8_t a) { (0 << 6) | (1 << 4) | (2 << 2) | (3 << 0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint16_t shuffle_idx[] = {3, 2, 1, 0, 7, 6, 5, 4}; + r_.sv128 = __riscv_vrgather_vv_i16m1(a_.sv128, __riscv_vle16_v_u16m1(shuffle_idx, 8), 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 3, 2, 1, 0, 7, 6, 5, 4); #else @@ -287,6 +303,9 @@ simde_vrev64q_s32(simde_int32x4_t a) { r_.m128i = _mm_shuffle_epi32(a_.m128i, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shuffle(a_.v128, a_.v128, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); + #elif defined(SIMDE_RISCV_V_NATIVE) + uint32_t shuffle_idx[] = {1, 0, 3, 2}; + r_.sv128 = __riscv_vrgather_vv_i32m1(a_.sv128, __riscv_vle32_v_u32m1(shuffle_idx, 4), 4); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 0, 3, 2); #else diff --git a/simde/arm/neon/subl.h b/simde/arm/neon/subl.h index 356bf5610..3ac143f7d 100644 --- a/simde/arm/neon/subl.h +++ b/simde/arm/neon/subl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SUBL_H) @@ -42,6 +43,12 @@ simde_int16x8_t simde_vsubl_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + r_.sv128 = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv64) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(simde_vmovl_s8(a), simde_vmovl_s8(b)); #endif @@ -56,6 +63,12 @@ simde_int32x4_t simde_vsubl_s16(simde_int16x4_t a, 
simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + r_.sv128 = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(simde_vmovl_s16(a), simde_vmovl_s16(b)); #endif @@ -70,6 +83,12 @@ simde_int64x2_t simde_vsubl_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + r_.sv128 = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(simde_vmovl_s32(a), simde_vmovl_s32(b)); #endif @@ -84,6 +103,12 @@ simde_uint16x8_t simde_vsubl_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + r_.sv128 = __riscv_vwsubu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(simde_vmovl_u8(a), simde_vmovl_u8(b)); #endif @@ -98,6 +123,12 @@ simde_uint32x4_t simde_vsubl_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + r_.sv128 = __riscv_vwsubu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(simde_vmovl_u16(a), simde_vmovl_u16(b)); #endif @@ -112,6 +143,12 @@ simde_uint64x2_t simde_vsubl_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubl_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + r_.sv128 = __riscv_vwsubu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64) , 4); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(simde_vmovl_u32(a), simde_vmovl_u32(b)); #endif diff --git a/simde/arm/neon/subl_high.h b/simde/arm/neon/subl_high.h index d45f4989b..860cb6e4d 100644 --- a/simde/arm/neon/subl_high.h +++ b/simde/arm/neon/subl_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Décio Luiz Gazzoni Filho + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SUBL_HIGH_H) @@ -41,6 +42,14 @@ simde_int16x8_t simde_vsubl_high_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + 
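/* Sketch of the widening-subtract mapping used throughout subl.h and
 * subl_high.h (not part of the patch): vwsub.vv / vwsubu.vv take two SEW-wide
 * sources and produce a 2*SEW result, so the separate vmovl step of the
 * portable path disappears. Assuming <riscv_vector.h> and 128-bit vectors; the
 * helper name is invented for illustration: */
static inline vuint64m1_t
sketch_subl_u32(vuint32m1_t a, vuint32m1_t b) {
  /* keep the two low uint32 lanes of each source at LMUL=1/2 */
  vuint32mf2_t a_lo = __riscv_vlmul_trunc_v_u32m1_u32mf2(a);
  vuint32mf2_t b_lo = __riscv_vlmul_trunc_v_u32m1_u32mf2(b);
  /* two uint64 result lanes: (uint64)a[i] - (uint64)b[i] */
  return __riscv_vwsubu_vv_u64m1(a_lo, b_lo, 2);
}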
simde_int8x16_private b_ = simde_int8x16_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16); + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv128) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv128) , 8); + return simde_int16x8_from_private(r_); #else return simde_vsubq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b)); #endif @@ -55,6 +64,14 @@ simde_int32x4_t simde_vsubl_high_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int32x4_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8); + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv128) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv128) , 4); + return simde_int32x4_from_private(r_); #else return simde_vsubq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b)); #endif @@ -69,6 +86,14 @@ simde_int64x2_t simde_vsubl_high_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_int64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_i32m1(a_.sv128 , 2, 4); + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv128) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv128) , 2); + return simde_int64x2_from_private(r_); #else return simde_vsubq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b)); #endif @@ -83,6 +108,14 @@ simde_uint16x8_t simde_vsubl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16); + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwsubu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128) , 8); + return simde_uint16x8_from_private(r_); #else return simde_vsubq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b)); #endif @@ -97,6 +130,14 @@ simde_uint32x4_t simde_vsubl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint32x4_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8); + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwsubu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , 4); + return simde_uint32x4_from_private(r_); #else return simde_vsubq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b)); #endif @@ -111,6 +152,14 @@ 
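/*
 * Illustrative sketch, not part of the patch: the vsubl_high_* paths above
 * differ from vsubl_* only in operand selection -- vslidedown.vx by half the
 * lane count moves the upper half of each 128-bit input down to lane 0, after
 * which the same truncate-then-widen subtract applies.  Same assumptions as the
 * previous note; the helper name is made up.
 */
#include <riscv_vector.h>

/* Move the high 8 bytes of a 16-byte vector into lanes 0..7. */
static inline vuint8m1_t rvv_high_half_u8x16(vuint8m1_t v) {
  return __riscv_vslidedown_vx_u8m1(v, 8, 16);  /* slide down by 8, vl = 16 */
}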
simde_uint64x2_t simde_vsubl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubl_high_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) && (SIMDE_NATURAL_VECTOR_SIZE == 128) + simde_uint64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + a_.sv128 = __riscv_vslidedown_vx_u32m1(a_.sv128 , 2, 4); + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4); + r_.sv128 = __riscv_vwsubu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , 2); + return simde_uint64x2_from_private(r_); #else return simde_vsubq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b)); #endif diff --git a/simde/arm/neon/subw.h b/simde/arm/neon/subw.h index 51d6cf4bf..2f44a3529 100644 --- a/simde/arm/neon/subw.h +++ b/simde/arm/neon/subw.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SUBW_H) @@ -40,14 +41,16 @@ simde_int16x8_t simde_vsubw_s8(simde_int16x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_s8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s16(a, simde_vmovl_s8(b)); #else simde_int16x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int8x8_private b_ = simde_int8x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsub_wv_i16m1(a_.sv128 , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -70,14 +73,16 @@ simde_int32x4_t simde_vsubw_s16(simde_int32x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_s16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s32(a, simde_vmovl_s16(b)); #else simde_int32x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsub_wv_i32m1(a_.sv128 , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -100,14 +105,16 @@ simde_int64x2_t simde_vsubw_s32(simde_int64x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_s32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s64(a, simde_vmovl_s32(b)); #else simde_int64x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsub_wv_i64m1(a_.sv128 , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , 2); + 
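/*
 * Illustrative sketch, not part of the patch: the vsubw_* hunks keep the
 * already-wide accumulator as-is and truncate only the narrow operand, using
 * the .wv form of the widening subtract (vwsub.wv / vwsubu.wv) rather than the
 * .vv form used for vsubl_*.  Same assumptions as above; the helper name is
 * made up.
 */
#include <riscv_vector.h>

/* int32x4 (wide) - int16x4 (narrow) -> int32x4, as in simde_vsubw_s16. */
static inline vint32m1_t rvv_widening_sub_wv_i32x4(vint32m1_t a_wide, vint16m1_t b_narrow) {
  vint16mf2_t bn = __riscv_vlmul_trunc_v_i16m1_i16mf2(b_narrow);
  return __riscv_vwsub_wv_i32m1(a_wide, bn, 4);  /* 4 result lanes */
}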
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -130,14 +137,16 @@ simde_uint16x8_t simde_vsubw_u8(simde_uint16x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_u8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u16(a, simde_vmovl_u8(b)); #else simde_uint16x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint8x8_private b_ = simde_uint8x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsubu_wv_u16m1(a_.sv128 , __riscv_vlmul_trunc_v_u8m1_u8mf2(b_.sv64) , 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -160,14 +169,16 @@ simde_uint32x4_t simde_vsubw_u16(simde_uint32x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_u16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u32(a, simde_vmovl_u16(b)); #else simde_uint32x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_uint16x4_private b_ = simde_uint16x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsubu_wv_u32m1(a_.sv128 , __riscv_vlmul_trunc_v_u16m1_u16mf2(b_.sv64) , 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -190,14 +201,16 @@ simde_uint64x2_t simde_vsubw_u32(simde_uint64x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vsubw_u32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u64(a, simde_vmovl_u32(b)); #else simde_uint64x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_uint32x2_private b_ = simde_uint32x2_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vwsubu_wv_u64m1(a_.sv128 , __riscv_vlmul_trunc_v_u32m1_u32mf2(b_.sv64) , 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else diff --git a/simde/arm/neon/subw_high.h b/simde/arm/neon/subw_high.h index 729a478a7..f48c6ed67 100644 --- a/simde/arm/neon/subw_high.h +++ b/simde/arm/neon/subw_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SUBW_HIGH_H) @@ -40,14 +41,17 @@ simde_int16x8_t simde_vsubw_high_s8(simde_int16x8_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_s8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s16(a, 
simde_vmovl_high_s8(b)); #else simde_int16x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int8x16_private b_ = simde_int8x16_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwsub_wv_i16m1(a_.sv128 , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv128) , 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -70,14 +74,17 @@ simde_int32x4_t simde_vsubw_high_s16(simde_int32x4_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_s16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s32(a, simde_vmovl_high_s16(b)); #else simde_int32x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_int16x8_private b_ = simde_int16x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwsub_wv_i32m1(a_.sv128 , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv128) , 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -100,14 +107,16 @@ simde_int64x2_t simde_vsubw_high_s32(simde_int64x2_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_s32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_s64(a, simde_vmovl_high_s32(b)); #else simde_int64x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x4_private b_ = simde_int32x4_to_private(b); - - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2 , 4); + r_.sv128 = __riscv_vwsub_wv_i64m1(a_.sv128 , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv128) , 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -130,14 +139,17 @@ simde_uint16x8_t simde_vsubw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_u8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u16(a, simde_vmovl_high_u8(b)); #else simde_uint16x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint8x16_private b_ = simde_uint8x16_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16); + r_.sv128 = __riscv_vwsubu_wv_u16m1(a_.sv128 , __riscv_vlmul_trunc_v_u8m1_u8mf2(b_.sv128) , 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, 
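/*
 * Illustrative sketch, not part of the patch: vsubw_high_* composes the two
 * ideas above -- vslidedown.vx selects the high half of the q-register operand,
 * then the .wv widening subtract (here the unsigned vwsubu.wv) consumes it
 * without touching the wide accumulator.  Same assumptions; the helper name is
 * made up.
 */
#include <riscv_vector.h>

/* uint16x8 (wide) - high half of uint8x16 -> uint16x8, as in simde_vsubw_high_u8. */
static inline vuint16m1_t rvv_widening_sub_w_high_u8(vuint16m1_t a_wide, vuint8m1_t b_q) {
  vuint8m1_t  b_hi = __riscv_vslidedown_vx_u8m1(b_q, 8, 16);  /* high 8 bytes */
  vuint8mf2_t bn   = __riscv_vlmul_trunc_v_u8m1_u8mf2(b_hi);  /* keep lanes 0..7 */
  return __riscv_vwsubu_wv_u16m1(a_wide, bn, 8);              /* 8 result lanes */
}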
b_.values); r_.values -= a_.values; #else @@ -160,14 +172,17 @@ simde_uint32x4_t simde_vsubw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_u16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u32(a, simde_vmovl_high_u16(b)); #else simde_uint32x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_uint16x8_private b_ = simde_uint16x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8); + r_.sv128 = __riscv_vwsubu_wv_u32m1(a_.sv128 , __riscv_vlmul_trunc_v_u16m1_u16mf2(b_.sv128) , 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else @@ -190,14 +205,17 @@ simde_uint64x2_t simde_vsubw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsubw_high_u32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vsubq_u64(a, simde_vmovl_high_u32(b)); #else simde_uint64x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_uint32x4_private b_ = simde_uint32x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2 , 4); + r_.sv128 = __riscv_vwsubu_wv_u64m1(a_.sv128 , __riscv_vlmul_trunc_v_u32m1_u32mf2(b_.sv128) , 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values -= a_.values; #else diff --git a/simde/arm/neon/tbl.h b/simde/arm/neon/tbl.h index 3d0d841d6..36b7d3c5e 100644 --- a/simde/arm/neon/tbl.h +++ b/simde/arm/neon/tbl.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_TBL_H) @@ -60,6 +61,10 @@ simde_vtbl1_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_shuffle_pi8(a_.m64, _mm_or_si64(b_.m64, _mm_cmpgt_pi8(b_.m64, _mm_set1_pi8(7)))); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (b_.sv64, 8, 8); + r_.sv64 = __riscv_vrgather_vv_u8m1(a_.sv64 , b_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, 0, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -107,6 +112,11 @@ simde_vtbl2_u8(simde_uint8x8x2_t a, simde_uint8x8_t b) { __m128i b128 = _mm_set1_epi64(b_.m64); __m128i r128 = _mm_shuffle_epi8(a128, _mm_or_si128(b128, _mm_cmpgt_epi8(b128, _mm_set1_epi8(15)))); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t t_combine = __riscv_vslideup_vx_u8m1(a_[0].sv64 , a_[1].sv64 , 8 , 16); + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (b_.sv64 , 16 , 8); + vuint8m1_t r_tmp = __riscv_vrgather_vv_u8m1(t_combine , b_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_tmp, 0, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) 
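/*
 * Illustrative sketch, not part of the patch: the vtbl* hunks implement NEON's
 * table lookup with vrgather.vv, then build an out-of-range mask with vmsgeu.vx
 * and clear those lanes with vmerge.vxm so that indices past the end of the
 * table return 0, as TBL requires.  Same assumptions; names are made up.
 */
#include <riscv_vector.h>

/* 8-byte table, 8 indices; indices >= 8 yield 0 (vtbl1_u8-like behaviour). */
static inline vuint8m1_t rvv_tbl1_u8x8(vuint8m1_t table, vuint8m1_t idx) {
  vbool8_t   oob = __riscv_vmsgeu_vx_u8m1_b8(idx, 8, 8);    /* idx >= 8 ? */
  vuint8m1_t r   = __riscv_vrgather_vv_u8m1(table, idx, 8); /* gather lanes */
  return __riscv_vmerge_vxm_u8m1(r, 0, oob, 8);             /* zero oob lanes */
}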
/ sizeof(r_.values[0])) ; i++) { @@ -156,6 +166,16 @@ simde_vtbl3_u8(simde_uint8x8x3_t a, simde_uint8x8_t b) { __m128i r128_2 = _mm_shuffle_epi8(_mm_set1_epi64(a_[2].m64), b128); __m128i r128 = _mm_blendv_epi8(r128_01, r128_2, _mm_slli_epi32(b128, 3)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[0].sv64); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[1].sv64); + vuint8m2_t t3 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[2].sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t2 , t3 , 8 , 24); + t_combine = __riscv_vslideup_vx_u8m2(t1 , t_combine , 8 , 24); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(b_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 24, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vxm_u8m2(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -205,6 +225,18 @@ simde_vtbl4_u8(simde_uint8x8x4_t a, simde_uint8x8_t b) { __m128i r128_23 = _mm_shuffle_epi8(_mm_set_epi64(a_[3].m64, a_[2].m64), b128); __m128i r128 = _mm_blendv_epi8(r128_01, r128_23, _mm_slli_epi32(b128, 3)); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[0].sv64); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[1].sv64); + vuint8m2_t t3 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[2].sv64); + vuint8m2_t t4 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_[3].sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t3 , t4 , 8 , 32); + t_combine = __riscv_vslideup_vx_u8m2(t2 , t_combine , 8 , 32); + t_combine = __riscv_vslideup_vx_u8m2(t1 , t_combine , 8 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(b_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vxm_u8m2(r_tmp, 0, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/simde/arm/neon/tbx.h b/simde/arm/neon/tbx.h index 0b2cae222..fdd450d4c 100644 --- a/simde/arm/neon/tbx.h +++ b/simde/arm/neon/tbx.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_TBX_H) @@ -56,6 +57,10 @@ simde_vtbx1_u8(simde_uint8x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { __m128i r128 = _mm_shuffle_epi8(b128, c128); r128 = _mm_blendv_epi8(r128, a128, c128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (c_.sv64, 8, 16); + r_.sv64 = __riscv_vrgather_vv_u8m1(b_.sv64 , c_.sv64 , 8); + r_.sv64 = __riscv_vmerge_vvm_u8m1(r_.sv64, a_.sv64, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -107,6 +112,11 @@ simde_vtbx2_u8(simde_uint8x8_t a, simde_uint8x8x2_t b, simde_uint8x8_t c) { __m128i r128 = _mm_shuffle_epi8(b128, c128); r128 = _mm_blendv_epi8(r128, a128, c128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t t_combine = __riscv_vslideup_vx_u8m1(b_[0].sv64 , b_[1].sv64 , 8 , 16); + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8 (c_.sv64 , 16 , 8); + vuint8m1_t r_tmp = __riscv_vrgather_vv_u8m1(t_combine , c_.sv64 , 8); + r_.sv64 = 
__riscv_vmerge_vvm_u8m1(r_tmp, a_.sv64, mask, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -161,6 +171,17 @@ simde_vtbx3_u8(simde_uint8x8_t a, simde_uint8x8x3_t b, simde_uint8x8_t c) { __m128i r128 = _mm_blendv_epi8(r128_01, r128_2, _mm_slli_epi32(c128, 3)); r128 = _mm_blendv_epi8(r128, a128, c128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[0].sv64); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[1].sv64); + vuint8m2_t t3 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[2].sv64); + vuint8m2_t am2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_.sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t2 , t3 , 8 , 24); + t_combine = __riscv_vslideup_vx_u8m2(t1 , t_combine , 8 , 24); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(c_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 24, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vvm_u8m2(r_tmp, am2, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -215,6 +236,19 @@ simde_vtbx4_u8(simde_uint8x8_t a, simde_uint8x8x4_t b, simde_uint8x8_t c) { __m128i r128 = _mm_blendv_epi8(r128_01, r128_23, _mm_slli_epi32(c128, 3)); r128 = _mm_blendv_epi8(r128, a128, c128); r_.m64 = _mm_movepi64_pi64(r128); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m2_t t1 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[0].sv64); + vuint8m2_t t2 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[1].sv64); + vuint8m2_t t3 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[2].sv64); + vuint8m2_t t4 = __riscv_vlmul_ext_v_u8m1_u8m2 (b_[3].sv64); + vuint8m2_t am2 = __riscv_vlmul_ext_v_u8m1_u8m2 (a_.sv64); + vuint8m2_t t_combine = __riscv_vslideup_vx_u8m2(t3 , t4 , 8 , 32); + t_combine = __riscv_vslideup_vx_u8m2(t2 , t_combine , 8 , 32); + t_combine = __riscv_vslideup_vx_u8m2(t1 , t_combine , 8 , 32); + vuint8m2_t idxm2 = __riscv_vlmul_ext_v_u8m1_u8m2(c_.sv64); + vbool4_t mask = __riscv_vmsgeu_vx_u8m2_b4 (idxm2, 32, 8); + vuint8m2_t r_tmp = __riscv_vrgather_vv_u8m2(t_combine , idxm2 , 8); + r_.sv64 = __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vmerge_vvm_u8m2(r_tmp, am2, mask, 8)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
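/*
 * Illustrative sketch, not part of the patch: for the 2/3/4-table forms the
 * hunks above concatenate the 8-byte chunks with vslideup.vx (moving to LMUL=2
 * via vlmul_ext once the table no longer fits one register), gather, and
 * truncate back.  The vtbx* variants differ from vtbl* only in the final merge:
 * out-of-range lanes take the destination vector (vmerge.vvm) instead of zero
 * (vmerge.vxm).  Same assumptions; names are made up.
 */
#include <riscv_vector.h>

/* Two 8-byte tables, 8 indices; indices >= 16 keep 'def' (vtbx2_u8-like behaviour). */
static inline vuint8m1_t rvv_tbx2_u8x8(vuint8m1_t def, vuint8m1_t t0, vuint8m1_t t1, vuint8m1_t idx) {
  vuint8m1_t table = __riscv_vslideup_vx_u8m1(t0, t1, 8, 16);  /* t1 -> lanes 8..15 */
  vbool8_t   oob   = __riscv_vmsgeu_vx_u8m1_b8(idx, 16, 8);    /* idx >= 16 ? */
  vuint8m1_t r     = __riscv_vrgather_vv_u8m1(table, idx, 8);
  return __riscv_vmerge_vvm_u8m1(r, def, oob, 8);              /* fall back to def */
}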