Neon: RVV Intrinsics implementations - part 2. #1189

Merged · 4 commits · Jun 21, 2024
53 changes: 53 additions & 0 deletions simde/arm/neon/abal.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
* 2023 Chi-Wei Chu <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ABAL_H)
@@ -39,6 +40,14 @@ simde_int16x8_t
simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_s8(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x8_private r_, a_ = simde_int16x8_to_private(a);
simde_int8x8_private b_ = simde_int8x8_to_private(b);
simde_int8x8_private c_ = simde_int8x8_to_private(c);
vint16m1_t rst = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , \
__riscv_vlmul_trunc_v_i8m1_i8mf2(c_.sv64) , 8);
r_.sv128 = __riscv_vadd_vv_i16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8), a_.sv128, 8);
return simde_int16x8_from_private(r_);
#else
return simde_vaddq_s16(simde_vabdl_s8(b, c), a);
#endif
@@ -53,6 +62,13 @@ simde_int32x4_t
simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_s16(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x4_private r_, a_ = simde_int32x4_to_private(a);
simde_int16x4_private b_ = simde_int16x4_to_private(b);
simde_int16x4_private c_ = simde_int16x4_to_private(c);
vint32m1_t rst = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(c_.sv64) , 4);
r_.sv128 = __riscv_vadd_vv_i32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4), a_.sv128, 4);
return simde_int32x4_from_private(r_);
#else
return simde_vaddq_s32(simde_vabdl_s16(b, c), a);
#endif
@@ -67,6 +83,13 @@ simde_int64x2_t
simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_s32(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int64x2_private r_, a_ = simde_int64x2_to_private(a);
simde_int32x2_private b_ = simde_int32x2_to_private(b);
simde_int32x2_private c_ = simde_int32x2_to_private(c);
vint64m1_t rst = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(c_.sv64) , 2);
r_.sv128 = __riscv_vadd_vv_i64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2), a_.sv128, 2);
return simde_int64x2_from_private(r_);
#else
return simde_vaddq_s64(simde_vabdl_s32(b, c), a);
#endif
@@ -81,6 +104,16 @@ simde_uint16x8_t
simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_u8(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x8_private r_, a_ = simde_uint16x8_to_private(a);
simde_uint8x8_private b_ = simde_uint8x8_to_private(b);
simde_uint8x8_private c_ = simde_uint8x8_to_private(c);
vint16m1_t a_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(b_.sv64), 8));
vint16m1_t b_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(c_.sv64), 8));
vint16m1_t rst = __riscv_vsub_vv_i16m1(a_tmp, b_tmp, 8);
r_.sv128 = __riscv_vadd_vv_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8)), \
a_.sv128, 8);
return simde_uint16x8_from_private(r_);
#else
return simde_vaddq_u16(simde_vabdl_u8(b, c), a);
#endif
@@ -95,6 +128,16 @@ simde_uint32x4_t
simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_u16(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x4_private r_, a_ = simde_uint32x4_to_private(a);
simde_uint16x4_private b_ = simde_uint16x4_to_private(b);
simde_uint16x4_private c_ = simde_uint16x4_to_private(c);
vint32m1_t a_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(b_.sv64), 4));
vint32m1_t b_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(c_.sv64), 4));
vint32m1_t rst = __riscv_vsub_vv_i32m1(a_tmp, b_tmp, 4);
r_.sv128 = __riscv_vadd_vv_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4)), \
a_.sv128, 4);
return simde_uint32x4_from_private(r_);
#else
return simde_vaddq_u32(simde_vabdl_u16(b, c), a);
#endif
@@ -109,6 +152,16 @@ simde_uint64x2_t
simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_u32(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint64x2_private r_, a_ = simde_uint64x2_to_private(a);
simde_uint32x2_private b_ = simde_uint32x2_to_private(b);
simde_uint32x2_private c_ = simde_uint32x2_to_private(c);
vint64m1_t a_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(b_.sv64), 2));
vint64m1_t b_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(c_.sv64), 2));
vint64m1_t rst = __riscv_vsub_vv_i64m1(a_tmp, b_tmp, 2);
r_.sv128 = __riscv_vadd_vv_u64m1(__riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2)), \
a_.sv128, 2);
return simde_uint64x2_from_private(r_);
#else
return simde_vaddq_u64(simde_vabdl_u32(b, c), a);
#endif
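For reference, the RVV fallbacks added in this file follow one pattern for the absolute-difference-and-accumulate family: widen the inputs (a widening subtract via __riscv_vwsub_vv_* after truncating to LMUL=1/2 for the signed variants; a widening unsigned convert followed by a signed subtract for the unsigned ones), take the absolute value as max(x, -x) on the widened type (which cannot overflow, so no saturation is needed), then add the accumulator. The sketch below is illustrative only and not part of this patch; it assumes a header-only SIMDe checkout on the include path (simde/arm/neon.h) and checks simde_vabal_s8 against a scalar reference.

#include <simde/arm/neon.h>  /* assumed include path; adjust to your SIMDe checkout */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  int16_t acc[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int8_t  x[8]   = { 10, -10, 5, -5, 0, 127, -128, 1 };
  int8_t  y[8]   = { -10, 10, 5, 5, 0, -1, 127, 1 };

  /* r[i] = acc[i] + |x[i] - y[i]|, with the difference widened to 16 bits */
  simde_int16x8_t r = simde_vabal_s8(simde_vld1q_s16(acc),
                                     simde_vld1_s8(x),
                                     simde_vld1_s8(y));

  int16_t out[8];
  simde_vst1q_s16(out, r);

  for (int i = 0; i < 8; i++) {
    int16_t expected = (int16_t) (acc[i] + abs((int) x[i] - (int) y[i]));
    printf("lane %d: got %d, expected %d\n", i, out[i], expected);
  }
  return 0;
}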
58 changes: 58 additions & 0 deletions simde/arm/neon/abdl_high.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
* 2023 Chi-Wei Chu <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ABDL_HIGH_H)
@@ -38,6 +39,14 @@ simde_int16x8_t
simde_vabdl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_s8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x8_private r_;
simde_int8x16_private a_ = simde_int8x16_to_private(a);
simde_int8x16_private b_ = simde_int8x16_to_private(b);
vint16m1_t rst = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16)),
__riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16)) , 8);
r_.sv128 = __riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8);
return simde_int16x8_from_private(r_);
#else
return simde_vabdl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
#endif
@@ -52,6 +61,14 @@ simde_int32x4_t
simde_vabdl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_s16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x4_private r_;
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_int16x8_private b_ = simde_int16x8_to_private(b);
vint32m1_t rst = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8)) , \
__riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8)) , 4);
r_.sv128 = __riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4);
return simde_int32x4_from_private(r_);
#else
return simde_vabdl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
#endif
@@ -66,6 +83,14 @@ simde_int64x2_t
simde_vabdl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_s32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int64x2_private r_;
simde_int32x4_private a_ = simde_int32x4_to_private(a);
simde_int32x4_private b_ = simde_int32x4_to_private(b);
vint64m1_t rst = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(a_.sv128 , 2 , 4)) , \
__riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(b_.sv128 , 2 , 4)) , 2);
r_.sv128 = __riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2);
return simde_int64x2_from_private(r_);
#else
return simde_vabdl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
#endif
@@ -80,6 +105,17 @@ simde_uint16x8_t
simde_vabdl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_u8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x8_private r_;
simde_uint8x16_private a_ = simde_uint8x16_to_private(a);
simde_uint8x16_private b_ = simde_uint8x16_to_private(b);
vint16m1_t a_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1( \
__riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16)), 8));
vint16m1_t b_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1( \
__riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16)), 8));
vint16m1_t rst = __riscv_vsub_vv_i16m1(a_tmp, b_tmp, 8);
r_.sv128 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8));
return simde_uint16x8_from_private(r_);
#else
return simde_vabdl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
#endif
@@ -94,6 +130,17 @@ simde_uint32x4_t
simde_vabdl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_u16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x4_private r_;
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_uint16x8_private b_ = simde_uint16x8_to_private(b);
vint32m1_t a_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1( \
__riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8)), 4));
vint32m1_t b_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1( \
__riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8)), 4));
vint32m1_t rst = __riscv_vsub_vv_i32m1(a_tmp, b_tmp, 4);
r_.sv128 = __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4));
return simde_uint32x4_from_private(r_);
#else
return simde_vabdl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
#endif
@@ -108,6 +155,17 @@ simde_uint64x2_t
simde_vabdl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_u32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint64x2_private r_;
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
simde_uint32x4_private b_ = simde_uint32x4_to_private(b);
vint64m1_t a_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1( \
__riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(a_.sv128 , 2 , 4)), 2));
vint64m1_t b_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1( \
__riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(b_.sv128 , 2 , 4)), 2));
vint64m1_t rst = __riscv_vsub_vv_i64m1(a_tmp, b_tmp, 2);
r_.sv128 = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2));
return simde_uint64x2_from_private(r_);
#else
return simde_vabdl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
#endif
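The vabdl_high_* fallbacks reuse the same absolute-difference pattern, but first extract the upper half of each 128-bit input with __riscv_vslidedown_vx_* before truncating to LMUL=1/2 and widening. A short check of simde_vabdl_high_u8 against a scalar reference, under the same assumptions as the sketch above (illustrative only):

#include <simde/arm/neon.h>  /* assumed include path */
#include <stdio.h>

int main(void) {
  uint8_t x[16], y[16];
  for (int i = 0; i < 16; i++) {
    x[i] = (uint8_t) (i * 17);
    y[i] = (uint8_t) (255 - i);
  }

  /* vabdl_high_u8 operates on lanes 8..15 of the 128-bit inputs */
  uint16_t out[8];
  simde_vst1q_u16(out, simde_vabdl_high_u8(simde_vld1q_u8(x), simde_vld1q_u8(y)));

  for (int i = 0; i < 8; i++) {
    int hi = i + 8;
    uint16_t expected = (uint16_t) (x[hi] > y[hi] ? x[hi] - y[hi] : y[hi] - x[hi]);
    printf("lane %d: got %u, expected %u\n", i, (unsigned) out[i], (unsigned) expected);
  }
  return 0;
}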
43 changes: 31 additions & 12 deletions simde/arm/neon/addw.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2020 Sean Maher <[email protected]> (Copyright owned by Google, LLC)
* 2023 Chi-Wei Chu <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ADDW_H)
@@ -41,14 +42,17 @@ simde_int16x8_t
simde_vaddw_s8(simde_int16x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_s8(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_s16(a, simde_vmovl_s8(b));
#else
simde_int16x8_private r_;
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_int8x8_private b_ = simde_int8x8_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv64);
r_.sv128 = __riscv_vwadd_wv_i16m1(a_.sv128, vb, 8);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -71,14 +75,17 @@ simde_int32x4_t
simde_vaddw_s16(simde_int32x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_s16(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_s32(a, simde_vmovl_s16(b));
#else
simde_int32x4_private r_;
simde_int32x4_private a_ = simde_int32x4_to_private(a);
simde_int16x4_private b_ = simde_int16x4_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64);
r_.sv128 = __riscv_vwadd_wv_i32m1(a_.sv128, vb, 4);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -101,14 +108,17 @@ simde_int64x2_t
simde_vaddw_s32(simde_int64x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_s32(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_s64(a, simde_vmovl_s32(b));
#else
simde_int64x2_private r_;
simde_int64x2_private a_ = simde_int64x2_to_private(a);
simde_int32x2_private b_ = simde_int32x2_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64);
r_.sv128 = __riscv_vwadd_wv_i64m1(a_.sv128, vb, 2);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -131,14 +141,17 @@ simde_uint16x8_t
simde_vaddw_u8(simde_uint16x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_u8(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_u16(a, simde_vmovl_u8(b));
#else
simde_uint16x8_private r_;
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_uint8x8_private b_ = simde_uint8x8_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64);
r_.sv128 = __riscv_vwaddu_wv_u16m1(a_.sv128, vb, 8);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -161,14 +174,17 @@ simde_uint32x4_t
simde_vaddw_u16(simde_uint32x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_u16(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_u32(a, simde_vmovl_u16(b));
#else
simde_uint32x4_private r_;
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
simde_uint16x4_private b_ = simde_uint16x4_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64);
r_.sv128 = __riscv_vwaddu_wv_u32m1(a_.sv128, vb, 4);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -191,14 +207,17 @@ simde_uint64x2_t
simde_vaddw_u32(simde_uint64x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_u32(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_u64(a, simde_vmovl_u32(b));
#else
simde_uint64x2_private r_;
simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
simde_uint32x2_private b_ = simde_uint32x2_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64);
r_.sv128 = __riscv_vwaddu_wv_u64m1(a_.sv128, vb, 2);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
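In addw.h the RVV path uses the wide-operand widening add (__riscv_vwadd_wv_* / __riscv_vwaddu_wv_*), which takes the already-wide accumulator and the narrow operand directly, so no separate vmovl step is needed; the SIMDE_NATURAL_VECTOR_SIZE_GE(128) branch is additionally guarded with !defined(SIMDE_RISCV_V_NATIVE) so that control falls through to the private-representation branch where the RVV registers live. A small check of simde_vaddw_u8 against a scalar reference, under the same assumptions as above (illustrative only):

#include <simde/arm/neon.h>  /* assumed include path */
#include <stdio.h>

int main(void) {
  uint16_t acc[8] = { 0, 100, 200, 300, 400, 500, 600, 65535 };
  uint8_t  b[8]   = { 1, 2, 3, 4, 5, 6, 7, 8 };

  /* r[i] = acc[i] + b[i], with b zero-extended to 16 bits; wraps modulo 2^16 like vaddw_u8 */
  uint16_t out[8];
  simde_vst1q_u16(out, simde_vaddw_u8(simde_vld1q_u16(acc), simde_vld1_u8(b)));

  for (int i = 0; i < 8; i++) {
    uint16_t expected = (uint16_t) (acc[i] + b[i]);
    printf("lane %d: got %u, expected %u\n", i, (unsigned) out[i], (unsigned) expected);
  }
  return 0;
}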