arm/neon riscv64: additional RVV implementations - part1 (#1188)
Contains RVV (RISC-V Vector extension) implementations for the following Neon intrinsic families:

`abs`, `addl`, `addl_high`, `addlv`, `addv`, `cge`, `cgt`, `cle`, `clez`, `clt`, `cnt`, `fma`, `fms`, `fms_n`, `get_high`, `get_low`, `hsub`, `mla`, `mla_n`, `mlal`, `mlal_high`, `mlal_high_n`, `mlal_n`, `mls`, `mls_n`, `mlsl`, `mlsl_high`, `mlsl_high_n`, `mlsl_n`, `qsub`, `qtbl`, `qtbx`, `rbit`, `recpe`, `rev16`, `rev32`, `rev64`, `subl`, `subl_high`, `subw`, `subw_high`, `tbl`, `tbx`
Ruhung authored Jun 21, 2024
1 parent e30e6ec commit 6346405
Showing 43 changed files with 2,474 additions and 330 deletions.
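
All three files shown below follow the same dispatch pattern: when SIMDe detects RVV support (its `SIMDE_RISCV_V_NATIVE` macro), the NEON vector's private representation is processed with RVV intrinsics through the `sv64`/`sv128` members, with the vector-length (`vl`) argument set to the element count of the emulated 64- or 128-bit register; otherwise the existing scalar loops or other SIMD paths are kept. Below is a minimal, self-contained sketch of that pattern — not code from this commit — using plain arrays instead of SIMDe's private types and the compiler-defined `__riscv_vector` macro in place of SIMDe's own guard.

```c
/* Minimal standalone sketch of the dispatch pattern (not code from this
 * commit).  SIMDe guards its RVV paths with SIMDE_RISCV_V_NATIVE; here the
 * compiler-defined __riscv_vector macro is used so the example stands alone.
 * vl is the element count of the emulated 64-bit NEON vector. */
#include <stddef.h>
#include <stdint.h>
#if defined(__riscv_vector)
  #include <riscv_vector.h>
#endif

static void
abs_i8x8(int8_t r[8], const int8_t a[8]) {
#if defined(__riscv_vector)
  /* Same identity the diff uses: abs(a) == max(a, -a), with vl = 8. */
  vint8m1_t va = __riscv_vle8_v_i8m1(a, 8);
  vint8m1_t vr = __riscv_vmax_vv_i8m1(va, __riscv_vneg_v_i8m1(va, 8), 8);
  __riscv_vse8_v_i8m1(r, vr, 8);
#else
  /* Portable fallback, mirroring the scalar loops kept in the diff. */
  for (size_t i = 0 ; i < 8 ; i++) {
    r[i] = (int8_t) (a[i] < 0 ? -a[i] : a[i]);
  }
#endif
}
```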
72 changes: 55 additions & 17 deletions simde/arm/neon/abs.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
* 2023 Ju-Hung Li <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ABS_H)
@@ -74,10 +75,14 @@ simde_vabs_f16(simde_float16x4_t a) {
r_,
a_ = simde_float16x4_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vabsh_f16(a_.values[i]);
}
#if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
r_.sv64 = __riscv_vfabs_v_f16m1(a_.sv64 , 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vabsh_f16(a_.values[i]);
}
#endif

return simde_float16x4_from_private(r_);
#endif
@@ -97,10 +102,14 @@ simde_vabs_f32(simde_float32x2_t a) {
r_,
a_ = simde_float32x2_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#if defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vfabs_v_f32m1(a_.sv64 , 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#endif

return simde_float32x2_from_private(r_);
#endif
@@ -120,10 +129,14 @@ simde_vabs_f64(simde_float64x1_t a) {
r_,
a_ = simde_float64x1_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#if defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vfabs_v_f64m1(a_.sv64 , 1);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#endif

return simde_float64x1_from_private(r_);
#endif
@@ -145,6 +158,8 @@ simde_vabs_s8(simde_int8x8_t a) {

#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi8(a_.m64);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64 , __riscv_vneg_v_i8m1(a_.sv64 , 8) , 8);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -175,6 +190,8 @@ simde_vabs_s16(simde_int16x4_t a) {

#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi16(a_.m64);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64 , __riscv_vneg_v_i16m1(a_.sv64 , 4) , 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -205,6 +222,8 @@ simde_vabs_s32(simde_int32x2_t a) {

#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi32(a_.m64);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64 , __riscv_vneg_v_i32m1(a_.sv64 , 2) , 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -233,7 +252,9 @@ simde_vabs_s64(simde_int64x1_t a) {
r_,
a_ = simde_int64x1_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vmax_vv_i64m1(a_.sv64 , __riscv_vneg_v_i64m1(a_.sv64 , 1) , 1);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
@@ -261,10 +282,14 @@ simde_vabsq_f16(simde_float16x8_t a) {
r_,
a_ = simde_float16x8_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vabsh_f16(a_.values[i]);
}
#if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
r_.sv128 = __riscv_vfabs_v_f16m1(a_.sv128 , 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vabsh_f16(a_.values[i]);
}
#endif

return simde_float16x8_from_private(r_);
#endif
@@ -288,6 +313,8 @@ simde_vabsq_f32(simde_float32x4_t a) {

#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_f32x4_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vfabs_v_f32m1(a_.sv128 , 4);
#elif defined(SIMDE_X86_SSE_NATIVE)
simde_float32 mask_;
uint32_t u32_ = UINT32_C(0x7FFFFFFF);
@@ -325,6 +352,8 @@ simde_vabsq_f64(simde_float64x2_t a) {
uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF);
simde_memcpy(&mask_, &u64_, sizeof(u64_));
r_.m128d = _mm_and_pd(_mm_set1_pd(mask_), a_.m128d);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vfabs_v_f64m1(a_.sv128 , 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -358,6 +387,8 @@ simde_vabsq_s8(simde_int8x16_t a) {
r_.m128i = _mm_min_epu8(a_.m128i, _mm_sub_epi8(_mm_setzero_si128(), a_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128 , __riscv_vneg_v_i8m1(a_.sv128 , 16) , 16);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -394,6 +425,8 @@ simde_vabsq_s16(simde_int16x8_t a) {
r_.m128i = _mm_max_epi16(a_.m128i, _mm_sub_epi16(_mm_setzero_si128(), a_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128 , __riscv_vneg_v_i16m1(a_.sv128 , 8) , 8);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -431,6 +464,8 @@ simde_vabsq_s32(simde_int32x4_t a) {
r_.m128i = _mm_sub_epi32(_mm_xor_si128(a_.m128i, m), m);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i32x4_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128 , __riscv_vneg_v_i32m1(a_.sv128 , 4) , 4);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -452,6 +487,7 @@ simde_vabsq_s32(simde_int32x4_t a) {
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vabsq_s64(simde_int64x2_t a) {

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabsq_s64(a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
@@ -470,6 +506,8 @@ simde_vabsq_s64(simde_int64x2_t a) {
r_.m128i = _mm_sub_epi64(_mm_xor_si128(a_.m128i, m), m);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i64x2_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vmax_vv_i64m1(a_.sv128 , __riscv_vneg_v_i64m1(a_.sv128 , 2) , 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
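
The floating-point paths in abs.h above use `__riscv_vfabs`, and the `float16` variants are additionally gated on `SIMDE_ARCH_RISCV_ZVFH` because half-precision vectors need the Zvfh extension. A hedged standalone sketch of the same idea for `float32` (not the commit's code), assuming an RVV-enabled toolchain:

```c
/* Standalone sketch of the vfabs pattern used for the float types above. */
#include <riscv_vector.h>

static void
fabs_f32x2(float r[2], const float a[2]) {
  vfloat32m1_t va = __riscv_vle32_v_f32m1(a, 2);
  /* vfabs.v is a pseudo-instruction for vfsgnjx.vv v, v, v: it clears the sign bit. */
  __riscv_vse32_v_f32m1(r, __riscv_vfabs_v_f32m1(va, 2), 2);
}
```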
43 changes: 43 additions & 0 deletions simde/arm/neon/addl.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2020 Sean Maher <[email protected]> (Copyright owned by Google, LLC)
* 2023 Ju-Hung Li <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ADDL_H)
@@ -42,6 +43,13 @@ simde_int16x8_t
simde_vaddl_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_s8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x8_private r_;
simde_int8x8_private a_ = simde_int8x8_to_private(a);
simde_int8x8_private b_ = simde_int8x8_to_private(b);

r_.sv128 = __riscv_vwadd_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv64) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , 8);
return simde_int16x8_from_private(r_);
#else
return simde_vaddq_s16(simde_vmovl_s8(a), simde_vmovl_s8(b));
#endif
@@ -56,6 +64,13 @@ simde_int32x4_t
simde_vaddl_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_s16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x4_private r_;
simde_int16x4_private a_ = simde_int16x4_to_private(a);
simde_int16x4_private b_ = simde_int16x4_to_private(b);

r_.sv128 = __riscv_vwadd_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , 4);
return simde_int32x4_from_private(r_);
#else
return simde_vaddq_s32(simde_vmovl_s16(a), simde_vmovl_s16(b));
#endif
@@ -70,6 +85,13 @@ simde_int64x2_t
simde_vaddl_s32(simde_int32x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_s32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int64x2_private r_;
simde_int32x2_private a_ = simde_int32x2_to_private(a);
simde_int32x2_private b_ = simde_int32x2_to_private(b);

r_.sv128 = __riscv_vwadd_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , 2);
return simde_int64x2_from_private(r_);
#else
return simde_vaddq_s64(simde_vmovl_s32(a), simde_vmovl_s32(b));
#endif
@@ -84,6 +106,13 @@ simde_uint16x8_t
simde_vaddl_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_u8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x8_private r_;
simde_uint8x8_private a_ = simde_uint8x8_to_private(a);
simde_uint8x8_private b_ = simde_uint8x8_to_private(b);

r_.sv128 = __riscv_vwaddu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64) , 8);
return simde_uint16x8_from_private(r_);
#else
return simde_vaddq_u16(simde_vmovl_u8(a), simde_vmovl_u8(b));
#endif
@@ -98,6 +127,13 @@ simde_uint32x4_t
simde_vaddl_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_u16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x4_private r_;
simde_uint16x4_private a_ = simde_uint16x4_to_private(a);
simde_uint16x4_private b_ = simde_uint16x4_to_private(b);

r_.sv128 = __riscv_vwaddu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64) , 4);
return simde_uint32x4_from_private(r_);
#else
return simde_vaddq_u32(simde_vmovl_u16(a), simde_vmovl_u16(b));
#endif
@@ -112,6 +148,13 @@ simde_uint64x2_t
simde_vaddl_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_u32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint64x2_private r_;
simde_uint32x2_private a_ = simde_uint32x2_to_private(a);
simde_uint32x2_private b_ = simde_uint32x2_to_private(b);

r_.sv128 = __riscv_vwaddu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64) , 2);
return simde_uint64x2_from_private(r_);
#else
return simde_vaddq_u64(simde_vmovl_u32(a), simde_vmovl_u32(b));
#endif
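
Each `vaddl_*` body in addl.h above narrows its two 64-bit inputs from LMUL=1 to LMUL=1/2 views with `__riscv_vlmul_trunc_*`, then lets a single widening add (`vwadd`/`vwaddu`) produce the doubled-width 128-bit result. A standalone sketch of that shape (not the commit's code; plain arrays stand in for SIMDe's private types):

```c
/* Standalone sketch of the narrow-then-widening-add pattern from addl.h. */
#include <riscv_vector.h>
#include <stdint.h>

static void
addl_i8x8(int16_t r[8], const int8_t a[8], const int8_t b[8]) {
  vint8m1_t va = __riscv_vle8_v_i8m1(a, 8);
  vint8m1_t vb = __riscv_vle8_v_i8m1(b, 8);
  /* vwadd takes LMUL=1/2 int8 operands and yields an LMUL=1 int16 result. */
  vint16m1_t vr = __riscv_vwadd_vv_i16m1(
      __riscv_vlmul_trunc_v_i8m1_i8mf2(va),
      __riscv_vlmul_trunc_v_i8m1_i8mf2(vb), 8);
  __riscv_vse16_v_i16m1(r, vr, 8);
}
```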
55 changes: 55 additions & 0 deletions simde/arm/neon/addl_high.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2020 Sean Maher <[email protected]> (Copyright owned by Google, LLC)
* 2023 Ju-Hung Li <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ADDL_HIGH_H)
@@ -42,6 +43,15 @@ simde_int16x8_t
simde_vaddl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x8_private r_;
simde_int8x16_private a_ = simde_int8x16_to_private(a);
simde_int8x16_private b_ = simde_int8x16_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16);
b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16);
r_.sv128 = __riscv_vwadd_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv128) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv128) , 8);
return simde_int16x8_from_private(r_);
#else
return simde_vaddq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b));
#endif
@@ -56,6 +66,15 @@ simde_int32x4_t
simde_vaddl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x4_private r_;
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_int16x8_private b_ = simde_int16x8_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8);
b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8);
r_.sv128 = __riscv_vwadd_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv128) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv128) , 4);
return simde_int32x4_from_private(r_);
#else
return simde_vaddq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b));
#endif
@@ -70,6 +89,15 @@ simde_int64x2_t
simde_vaddl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int64x2_private r_;
simde_int32x4_private a_ = simde_int32x4_to_private(a);
simde_int32x4_private b_ = simde_int32x4_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_i32m1(a_.sv128 , 2, 4);
b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4);
r_.sv128 = __riscv_vwadd_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv128) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv128) , 2);
return simde_int64x2_from_private(r_);
#else
return simde_vaddq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b));
#endif
@@ -84,6 +112,15 @@ simde_uint16x8_t
simde_vaddl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x8_private r_;
simde_uint8x16_private a_ = simde_uint8x16_to_private(a);
simde_uint8x16_private b_ = simde_uint8x16_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16);
b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16);
r_.sv128 = __riscv_vwaddu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128) , 8);
return simde_uint16x8_from_private(r_);
#else
return simde_vaddq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b));
#endif
@@ -98,6 +135,15 @@ simde_uint32x4_t
simde_vaddl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x4_private r_;
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_uint16x8_private b_ = simde_uint16x8_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8);
b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8);
r_.sv128 = __riscv_vwaddu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , 4);
return simde_uint32x4_from_private(r_);
#else
return simde_vaddq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b));
#endif
@@ -112,6 +158,15 @@ simde_uint64x2_t
simde_vaddl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint64x2_private r_;
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
simde_uint32x4_private b_ = simde_uint32x4_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_u32m1(a_.sv128 , 2, 4);
b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4);
r_.sv128 = __riscv_vwaddu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , 2);
return simde_uint64x2_from_private(r_);
#else
return simde_vaddq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b));
#endif
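
The `*_high` variants in addl_high.h differ only in that `vslidedown` first moves the upper half of each 128-bit input down to element 0 before the same truncate-and-widen add is applied. A standalone sketch (again not the commit's code):

```c
/* Standalone sketch of the slide-down-then-widening-add pattern from addl_high.h. */
#include <riscv_vector.h>
#include <stdint.h>

static void
addl_high_i8x16(int16_t r[8], const int8_t a[16], const int8_t b[16]) {
  /* Slide the upper 8 of 16 int8 elements down to positions 0..7. */
  vint8m1_t va = __riscv_vslidedown_vx_i8m1(__riscv_vle8_v_i8m1(a, 16), 8, 16);
  vint8m1_t vb = __riscv_vslidedown_vx_i8m1(__riscv_vle8_v_i8m1(b, 16), 8, 16);
  /* Then widen-add exactly as in the addl.h sketch, with vl = 8. */
  vint16m1_t vr = __riscv_vwadd_vv_i16m1(
      __riscv_vlmul_trunc_v_i8m1_i8mf2(va),
      __riscv_vlmul_trunc_v_i8m1_i8mf2(vb), 8);
  __riscv_vse16_v_i16m1(r, vr, 8);
}
```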