Neon : RVV Intrinsics implementations - part1. #1188

Merged · 6 commits · Jun 21, 2024
72 changes: 55 additions & 17 deletions simde/arm/neon/abs.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
* 2023 Ju-Hung Li <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ABS_H)
@@ -74,10 +75,14 @@ simde_vabs_f16(simde_float16x4_t a) {
r_,
a_ = simde_float16x4_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vabsh_f16(a_.values[i]);
}
#if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
r_.sv64 = __riscv_vfabs_v_f16m1(a_.sv64 , 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vabsh_f16(a_.values[i]);
}
#endif

return simde_float16x4_from_private(r_);
#endif
@@ -97,10 +102,14 @@ simde_vabs_f32(simde_float32x2_t a) {
r_,
a_ = simde_float32x2_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#if defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vfabs_v_f32m1(a_.sv64 , 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#endif

return simde_float32x2_from_private(r_);
#endif
@@ -120,10 +129,14 @@ simde_vabs_f64(simde_float64x1_t a) {
r_,
a_ = simde_float64x1_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#if defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vfabs_v_f64m1(a_.sv64 , 1);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
}
#endif

return simde_float64x1_from_private(r_);
#endif
@@ -145,6 +158,8 @@ simde_vabs_s8(simde_int8x8_t a) {

#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi8(a_.m64);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64 , __riscv_vneg_v_i8m1(a_.sv64 , 8) , 8);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -175,6 +190,8 @@ simde_vabs_s16(simde_int16x4_t a) {

#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi16(a_.m64);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64 , __riscv_vneg_v_i16m1(a_.sv64 , 4) , 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -205,6 +222,8 @@ simde_vabs_s32(simde_int32x2_t a) {

#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_abs_pi32(a_.m64);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64 , __riscv_vneg_v_i32m1(a_.sv64 , 2) , 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100761)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -233,7 +252,9 @@ simde_vabs_s64(simde_int64x1_t a) {
r_,
a_ = simde_int64x1_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vmax_vv_i64m1(a_.sv64 , __riscv_vneg_v_i64m1(a_.sv64 , 1) , 1);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
#else
@@ -261,10 +282,14 @@ simde_vabsq_f16(simde_float16x8_t a) {
r_,
a_ = simde_float16x8_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vabsh_f16(a_.values[i]);
}
#if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
r_.sv128 = __riscv_vfabs_v_f16m1(a_.sv128 , 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vabsh_f16(a_.values[i]);
}
#endif

return simde_float16x8_from_private(r_);
#endif
@@ -288,6 +313,8 @@ simde_vabsq_f32(simde_float32x4_t a) {

#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_f32x4_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vfabs_v_f32m1(a_.sv128 , 4);
#elif defined(SIMDE_X86_SSE_NATIVE)
simde_float32 mask_;
uint32_t u32_ = UINT32_C(0x7FFFFFFF);
@@ -325,6 +352,8 @@ simde_vabsq_f64(simde_float64x2_t a) {
uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF);
simde_memcpy(&mask_, &u64_, sizeof(u64_));
r_.m128d = _mm_and_pd(_mm_set1_pd(mask_), a_.m128d);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vfabs_v_f64m1(a_.sv128 , 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -358,6 +387,8 @@ simde_vabsq_s8(simde_int8x16_t a) {
r_.m128i = _mm_min_epu8(a_.m128i, _mm_sub_epi8(_mm_setzero_si128(), a_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128 , __riscv_vneg_v_i8m1(a_.sv128 , 16) , 16);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT8_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -394,6 +425,8 @@ simde_vabsq_s16(simde_int16x8_t a) {
r_.m128i = _mm_max_epi16(a_.m128i, _mm_sub_epi16(_mm_setzero_si128(), a_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128 , __riscv_vneg_v_i16m1(a_.sv128 , 8) , 8);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT16_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -431,6 +464,8 @@ simde_vabsq_s32(simde_int32x4_t a) {
r_.m128i = _mm_sub_epi32(_mm_xor_si128(a_.m128i, m), m);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i32x4_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128 , __riscv_vneg_v_i32m1(a_.sv128 , 4) , 4);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT32_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
@@ -452,6 +487,7 @@ simde_vabsq_s32(simde_int32x4_t a) {
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vabsq_s64(simde_int64x2_t a) {

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabsq_s64(a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
@@ -470,6 +506,8 @@ simde_vabsq_s64(simde_int64x2_t a) {
r_.m128i = _mm_sub_epi64(_mm_xor_si128(a_.m128i, m), m);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i64x2_abs(a_.v128);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vmax_vv_i64m1(a_.sv128 , __riscv_vneg_v_i64m1(a_.sv128 , 2) , 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
__typeof__(r_.values) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values < INT64_C(0));
r_.values = (-a_.values & m) | (a_.values & ~m);
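The abs.h hunks use two RVV patterns: floating-point lanes go through __riscv_vfabs_v_*, while signed integer lanes take the maximum of the value and its negation. The following is a minimal standalone sketch of the integer idiom, not part of the diff; the function and buffer names are hypothetical, and it assumes a toolchain providing <riscv_vector.h>.

#include <riscv_vector.h>
#include <stdint.h>

/* Illustrative sketch: absolute value of 8 int16 lanes with the same
 * vmax(v, vneg(v)) idiom the diff uses. Like NEON's vabsq_s16, INT16_MIN
 * wraps rather than saturates. */
static void abs_i16x8(const int16_t src[8], int16_t dst[8]) {
  vint16m1_t v = __riscv_vle16_v_i16m1(src, 8);   /* load 8 lanes */
  vint16m1_t n = __riscv_vneg_v_i16m1(v, 8);      /* 0 - v        */
  vint16m1_t r = __riscv_vmax_vv_i16m1(v, n, 8);  /* max(v, -v)   */
  __riscv_vse16_v_i16m1(dst, r, 8);               /* store result */
}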
43 changes: 43 additions & 0 deletions simde/arm/neon/addl.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2020 Sean Maher <[email protected]> (Copyright owned by Google, LLC)
* 2023 Ju-Hung Li <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ADDL_H)
@@ -42,6 +43,13 @@ simde_int16x8_t
simde_vaddl_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_s8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x8_private r_;
simde_int8x8_private a_ = simde_int8x8_to_private(a);
simde_int8x8_private b_ = simde_int8x8_to_private(b);

r_.sv128 = __riscv_vwadd_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv64) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , 8);
return simde_int16x8_from_private(r_);
#else
return simde_vaddq_s16(simde_vmovl_s8(a), simde_vmovl_s8(b));
#endif
@@ -56,6 +64,13 @@ simde_int32x4_t
simde_vaddl_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_s16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x4_private r_;
simde_int16x4_private a_ = simde_int16x4_to_private(a);
simde_int16x4_private b_ = simde_int16x4_to_private(b);

r_.sv128 = __riscv_vwadd_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , 4);
return simde_int32x4_from_private(r_);
#else
return simde_vaddq_s32(simde_vmovl_s16(a), simde_vmovl_s16(b));
#endif
@@ -70,6 +85,13 @@ simde_int64x2_t
simde_vaddl_s32(simde_int32x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_s32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int64x2_private r_;
simde_int32x2_private a_ = simde_int32x2_to_private(a);
simde_int32x2_private b_ = simde_int32x2_to_private(b);

r_.sv128 = __riscv_vwadd_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , 2);
return simde_int64x2_from_private(r_);
#else
return simde_vaddq_s64(simde_vmovl_s32(a), simde_vmovl_s32(b));
#endif
@@ -84,6 +106,13 @@ simde_uint16x8_t
simde_vaddl_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_u8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x8_private r_;
simde_uint8x8_private a_ = simde_uint8x8_to_private(a);
simde_uint8x8_private b_ = simde_uint8x8_to_private(b);

r_.sv128 = __riscv_vwaddu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64) , 8);
return simde_uint16x8_from_private(r_);
#else
return simde_vaddq_u16(simde_vmovl_u8(a), simde_vmovl_u8(b));
#endif
@@ -98,6 +127,13 @@ simde_uint32x4_t
simde_vaddl_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_u16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x4_private r_;
simde_uint16x4_private a_ = simde_uint16x4_to_private(a);
simde_uint16x4_private b_ = simde_uint16x4_to_private(b);

r_.sv128 = __riscv_vwaddu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64) , 4);
return simde_uint32x4_from_private(r_);
#else
return simde_vaddq_u32(simde_vmovl_u16(a), simde_vmovl_u16(b));
#endif
@@ -112,6 +148,13 @@ simde_uint64x2_t
simde_vaddl_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddl_u32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint64x2_private r_;
simde_uint32x2_private a_ = simde_uint32x2_to_private(a);
simde_uint32x2_private b_ = simde_uint32x2_to_private(b);

r_.sv128 = __riscv_vwaddu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv64) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64) , 2);
return simde_uint64x2_from_private(r_);
#else
return simde_vaddq_u64(simde_vmovl_u32(a), simde_vmovl_u32(b));
#endif
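The addl.h additions all follow one pattern: the 64-bit inputs sit in m1 register groups, so they are first narrowed to mf2 with __riscv_vlmul_trunc_* and then widened-added into an m1 result with __riscv_vwadd_vv_* (signed) or __riscv_vwaddu_vv_* (unsigned). A minimal sketch of that pattern outside SIMDe, with made-up buffer and function names, might look like:

#include <riscv_vector.h>
#include <stdint.h>

/* Illustrative sketch of the widening-add pattern from addl.h:
 * two vectors of 8 int8 lanes are added into 8 int16 lanes. */
static void addl_i8x8(const int8_t a[8], const int8_t b[8], int16_t out[8]) {
  vint8m1_t va = __riscv_vle8_v_i8m1(a, 8);
  vint8m1_t vb = __riscv_vle8_v_i8m1(b, 8);
  /* Narrow the register group so the widening add yields an m1 result. */
  vint8mf2_t ha = __riscv_vlmul_trunc_v_i8m1_i8mf2(va);
  vint8mf2_t hb = __riscv_vlmul_trunc_v_i8m1_i8mf2(vb);
  vint16m1_t sum = __riscv_vwadd_vv_i16m1(ha, hb, 8);  /* sign-extending add */
  __riscv_vse16_v_i16m1(out, sum, 8);
}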
55 changes: 55 additions & 0 deletions simde/arm/neon/addl_high.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2020 Sean Maher <[email protected]> (Copyright owned by Google, LLC)
* 2023 Ju-Hung Li <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ADDL_HIGH_H)
@@ -42,6 +43,15 @@ simde_int16x8_t
simde_vaddl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x8_private r_;
simde_int8x16_private a_ = simde_int8x16_to_private(a);
simde_int8x16_private b_ = simde_int8x16_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16);
b_.sv128 = __riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16);
r_.sv128 = __riscv_vwadd_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(a_.sv128) , __riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv128) , 8);
return simde_int16x8_from_private(r_);
#else
return simde_vaddq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b));
#endif
@@ -56,6 +66,15 @@ simde_int32x4_t
simde_vaddl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x4_private r_;
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_int16x8_private b_ = simde_int16x8_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8);
b_.sv128 = __riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8);
r_.sv128 = __riscv_vwadd_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(a_.sv128) , __riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv128) , 4);
return simde_int32x4_from_private(r_);
#else
return simde_vaddq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b));
#endif
@@ -70,6 +89,15 @@ simde_int64x2_t
simde_vaddl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int64x2_private r_;
simde_int32x4_private a_ = simde_int32x4_to_private(a);
simde_int32x4_private b_ = simde_int32x4_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_i32m1(a_.sv128 , 2, 4);
b_.sv128 = __riscv_vslidedown_vx_i32m1(b_.sv128 , 2, 4);
r_.sv128 = __riscv_vwadd_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv128) , __riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv128) , 2);
return simde_int64x2_from_private(r_);
#else
return simde_vaddq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b));
#endif
@@ -84,6 +112,15 @@ simde_uint16x8_t
simde_vaddl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x8_private r_;
simde_uint8x16_private a_ = simde_uint8x16_to_private(a);
simde_uint8x16_private b_ = simde_uint8x16_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16);
b_.sv128 = __riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16);
r_.sv128 = __riscv_vwaddu_vv_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv128) , 8);
return simde_uint16x8_from_private(r_);
#else
return simde_vaddq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b));
#endif
@@ -98,6 +135,15 @@ simde_uint32x4_t
simde_vaddl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x4_private r_;
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_uint16x8_private b_ = simde_uint16x8_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8);
b_.sv128 = __riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8);
r_.sv128 = __riscv_vwaddu_vv_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv128) , 4);
return simde_uint32x4_from_private(r_);
#else
return simde_vaddq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b));
#endif
@@ -112,6 +158,15 @@ simde_uint64x2_t
simde_vaddl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint64x2_private r_;
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
simde_uint32x4_private b_ = simde_uint32x4_to_private(b);

a_.sv128 = __riscv_vslidedown_vx_u32m1(a_.sv128 , 2, 4);
b_.sv128 = __riscv_vslidedown_vx_u32m1(b_.sv128 , 2, 4);
r_.sv128 = __riscv_vwaddu_vv_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2 (a_.sv128) , __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv128) , 2);
return simde_uint64x2_from_private(r_);
#else
return simde_vaddq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b));
#endif
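addl_high.h reuses the same widening-add pattern, but first brings the upper half of each 128-bit input down to the low lanes with __riscv_vslidedown_vx_*. A hedged sketch of that high-half extraction, again with hypothetical names and not taken from the diff:

#include <riscv_vector.h>
#include <stdint.h>

/* Illustrative sketch of the addl_high pattern: slide the upper 8 of 16
 * int8 lanes down to positions 0..7, then widen-add as in addl.h. */
static void addl_high_i8x16(const int8_t a[16], const int8_t b[16], int16_t out[8]) {
  vint8m1_t va = __riscv_vle8_v_i8m1(a, 16);
  vint8m1_t vb = __riscv_vle8_v_i8m1(b, 16);
  va = __riscv_vslidedown_vx_i8m1(va, 8, 16);  /* move lanes 8..15 to 0..7 */
  vb = __riscv_vslidedown_vx_i8m1(vb, 8, 16);
  vint16m1_t sum = __riscv_vwadd_vv_i16m1(
      __riscv_vlmul_trunc_v_i8m1_i8mf2(va),
      __riscv_vlmul_trunc_v_i8m1_i8mf2(vb), 8);
  __riscv_vse16_v_i16m1(out, sum, 8);
}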