arm/neon riscv64: additional RVV implementations - part 2. (#1189)
Contains RVV implementations for the following Neon instructions: 

`abal`, `abdl_high`, `addw`, `addw_high`, `bcax`, `bic`, `cadd_rot270`, `cadd_rot90`, `cmla_lane`, `cmla_rot180_lane`, `cmla_rot270_lane`, `cmla_rot90_lane`, `combine`, `cvt`, `dot`, `dot_lane`, `dup_n`, `eor`, `ext`, `maxnmv`, `minnmv`, `movl`, `movn`, `qdmull`, `qshlu_n`, `rnda`, `rsubhn`, `shl`, `shl_n`, `shll_n`, `shr_n`, `shrn_n`, `sqadd`, `sqrt`
wewe5215 authored Jun 21, 2024
1 parent 6346405 commit c903416
Showing 34 changed files with 3,092 additions and 1,231 deletions.
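
A minimal usage sketch (not part of the commit; the values and build setup are illustrative) showing how one of the newly covered intrinsics is reached through SIMDe. When `SIMDE_RISCV_V_NATIVE` is defined, `simde_vabal_s8` dispatches to the RVV path added in `simde/arm/neon/abal.h` below:

```c
#include <stdio.h>
#include "simde/arm/neon.h"

int main(void) {
  int16_t acc_init[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int8_t  lhs[8]      = { 10, -20, 30, -40, 50, -60, 70, -80 };
  int8_t  rhs[8]      = { 5, 5, 5, 5, 5, 5, 5, 5 };

  simde_int16x8_t acc = simde_vld1q_s16(acc_init);
  simde_int8x8_t  b   = simde_vld1_s8(lhs);
  simde_int8x8_t  c   = simde_vld1_s8(rhs);

  /* r[i] = acc[i] + |lhs[i] - rhs[i]| */
  simde_int16x8_t r = simde_vabal_s8(acc, b, c);

  int16_t out[8];
  simde_vst1q_s16(out, r);
  for (int i = 0; i < 8; i++) printf("%d ", out[i]);
  printf("\n");
  return 0;
}
```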
53 changes: 53 additions & 0 deletions simde/arm/neon/abal.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
* 2023 Chi-Wei Chu <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ABAL_H)
@@ -39,6 +40,14 @@ simde_int16x8_t
simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_s8(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x8_private r_, a_ = simde_int16x8_to_private(a);
simde_int8x8_private b_ = simde_int8x8_to_private(b);
simde_int8x8_private c_ = simde_int8x8_to_private(c);
vint16m1_t rst = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , \
__riscv_vlmul_trunc_v_i8m1_i8mf2(c_.sv64) , 8);
r_.sv128 = __riscv_vadd_vv_i16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8), a_.sv128, 8);
return simde_int16x8_from_private(r_);
#else
return simde_vaddq_s16(simde_vabdl_s8(b, c), a);
#endif
@@ -53,6 +62,13 @@ simde_int32x4_t
simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_s16(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x4_private r_, a_ = simde_int32x4_to_private(a);
simde_int16x4_private b_ = simde_int16x4_to_private(b);
simde_int16x4_private c_ = simde_int16x4_to_private(c);
vint32m1_t rst = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(c_.sv64) , 4);
r_.sv128 = __riscv_vadd_vv_i32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4), a_.sv128, 4);
return simde_int32x4_from_private(r_);
#else
return simde_vaddq_s32(simde_vabdl_s16(b, c), a);
#endif
@@ -67,6 +83,13 @@ simde_int64x2_t
simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_s32(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int64x2_private r_, a_ = simde_int64x2_to_private(a);
simde_int32x2_private b_ = simde_int32x2_to_private(b);
simde_int32x2_private c_ = simde_int32x2_to_private(c);
vint64m1_t rst = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(c_.sv64) , 2);
r_.sv128 = __riscv_vadd_vv_i64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2), a_.sv128, 2);
return simde_int64x2_from_private(r_);
#else
return simde_vaddq_s64(simde_vabdl_s32(b, c), a);
#endif
@@ -81,6 +104,16 @@ simde_uint16x8_t
simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_u8(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x8_private r_, a_ = simde_uint16x8_to_private(a);
simde_uint8x8_private b_ = simde_uint8x8_to_private(b);
simde_uint8x8_private c_ = simde_uint8x8_to_private(c);
vint16m1_t a_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(b_.sv64), 8));
vint16m1_t b_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(c_.sv64), 8));
vint16m1_t rst = __riscv_vsub_vv_i16m1(a_tmp, b_tmp, 8);
r_.sv128 = __riscv_vadd_vv_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8)), \
a_.sv128, 8);
return simde_uint16x8_from_private(r_);
#else
return simde_vaddq_u16(simde_vabdl_u8(b, c), a);
#endif
@@ -95,6 +128,16 @@ simde_uint32x4_t
simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_u16(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x4_private r_, a_ = simde_uint32x4_to_private(a);
simde_uint16x4_private b_ = simde_uint16x4_to_private(b);
simde_uint16x4_private c_ = simde_uint16x4_to_private(c);
vint32m1_t a_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(b_.sv64), 4));
vint32m1_t b_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(c_.sv64), 4));
vint32m1_t rst = __riscv_vsub_vv_i32m1(a_tmp, b_tmp, 4);
r_.sv128 = __riscv_vadd_vv_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4)), \
a_.sv128, 4);
return simde_uint32x4_from_private(r_);
#else
return simde_vaddq_u32(simde_vabdl_u16(b, c), a);
#endif
@@ -109,6 +152,16 @@ simde_uint64x2_t
simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabal_u32(a, b, c);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint64x2_private r_, a_ = simde_uint64x2_to_private(a);
simde_uint32x2_private b_ = simde_uint32x2_to_private(b);
simde_uint32x2_private c_ = simde_uint32x2_to_private(c);
vint64m1_t a_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(b_.sv64), 2));
vint64m1_t b_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(c_.sv64), 2));
vint64m1_t rst = __riscv_vsub_vv_i64m1(a_tmp, b_tmp, 2);
r_.sv128 = __riscv_vadd_vv_u64m1(__riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2)), \
a_.sv128, 2);
return simde_uint64x2_from_private(r_);
#else
return simde_vaddq_u64(simde_vabdl_u32(b, c), a);
#endif
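
The RVV paths above all share one shape: truncate the 64-bit sources to half-LMUL registers, widen-subtract, take the absolute value as `max(x, -x)`, then accumulate into `a`. A scalar reference of that semantics (a sketch for clarity, not code from the commit):

```c
#include <stdint.h>

/* Reference semantics of vabal_s8: widen, absolute difference, accumulate.
 * The RVV code computes |d| as max(d, -d) on the widened values, which is
 * safe here because the widened difference cannot overflow int16_t. */
static void vabal_s8_ref(int16_t r[8], const int16_t a[8],
                         const int8_t b[8], const int8_t c[8]) {
  for (int i = 0; i < 8; i++) {
    int16_t d  = (int16_t) (b[i] - c[i]);        /* __riscv_vwsub_vv  */
    int16_t ad = (int16_t) (d > -d ? d : -d);    /* vmax(d, vneg(d))  */
    r[i]       = (int16_t) (a[i] + ad);          /* __riscv_vadd_vv   */
  }
}
```

The unsigned variants follow the same pattern but first zero-extend with `__riscv_vwcvtu_x_x_v_*` and reinterpret to signed, so the same subtract and `max`/`neg` trick still applies.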
58 changes: 58 additions & 0 deletions simde/arm/neon/abdl_high.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
* 2023 Chi-Wei Chu <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ABDL_HIGH_H)
@@ -38,6 +39,14 @@ simde_int16x8_t
simde_vabdl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_s8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x8_private r_;
simde_int8x16_private a_ = simde_int8x16_to_private(a);
simde_int8x16_private b_ = simde_int8x16_to_private(b);
vint16m1_t rst = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16)),
__riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16)) , 8);
r_.sv128 = __riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8);
return simde_int16x8_from_private(r_);
#else
return simde_vabdl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
#endif
@@ -52,6 +61,14 @@ simde_int32x4_t
simde_vabdl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_s16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x4_private r_;
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_int16x8_private b_ = simde_int16x8_to_private(b);
vint32m1_t rst = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8)) , \
__riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8)) , 4);
r_.sv128 = __riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4);
return simde_int32x4_from_private(r_);
#else
return simde_vabdl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
#endif
@@ -66,6 +83,14 @@ simde_int64x2_t
simde_vabdl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_s32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int64x2_private r_;
simde_int32x4_private a_ = simde_int32x4_to_private(a);
simde_int32x4_private b_ = simde_int32x4_to_private(b);
vint64m1_t rst = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(a_.sv128 , 2 , 4)) , \
__riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(b_.sv128 , 2 , 4)) , 2);
r_.sv128 = __riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2);
return simde_int64x2_from_private(r_);
#else
return simde_vabdl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
#endif
@@ -80,6 +105,17 @@ simde_uint16x8_t
simde_vabdl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_u8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x8_private r_;
simde_uint8x16_private a_ = simde_uint8x16_to_private(a);
simde_uint8x16_private b_ = simde_uint8x16_to_private(b);
vint16m1_t a_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1( \
__riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16)), 8));
vint16m1_t b_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1( \
__riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16)), 8));
vint16m1_t rst = __riscv_vsub_vv_i16m1(a_tmp, b_tmp, 8);
r_.sv128 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8));
return simde_uint16x8_from_private(r_);
#else
return simde_vabdl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
#endif
@@ -94,6 +130,17 @@ simde_uint32x4_t
simde_vabdl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_u16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x4_private r_;
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_uint16x8_private b_ = simde_uint16x8_to_private(b);
vint32m1_t a_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1( \
__riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8)), 4));
vint32m1_t b_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1( \
__riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8)), 4));
vint32m1_t rst = __riscv_vsub_vv_i32m1(a_tmp, b_tmp, 4);
r_.sv128 = __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4));
return simde_uint32x4_from_private(r_);
#else
return simde_vabdl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
#endif
@@ -108,6 +155,17 @@ simde_uint64x2_t
simde_vabdl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vabdl_high_u32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint64x2_private r_;
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
simde_uint32x4_private b_ = simde_uint32x4_to_private(b);
vint64m1_t a_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1( \
__riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(a_.sv128 , 2 , 4)), 2));
vint64m1_t b_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1( \
__riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(b_.sv128 , 2 , 4)), 2));
vint64m1_t rst = __riscv_vsub_vv_i64m1(a_tmp, b_tmp, 2);
r_.sv128 = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2));
return simde_uint64x2_from_private(r_);
#else
return simde_vabdl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
#endif
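
The `abdl_high` variants differ from the widening absolute difference above only in operand selection: `__riscv_vslidedown_vx_*` by half the element count moves the upper lanes of each 128-bit source into the low positions before the same widen/abs-diff sequence runs. In scalar terms (a sketch, not part of the commit):

```c
#include <stdint.h>

/* Reference semantics of vabdl_high_u8: absolute difference of the upper
 * eight lanes of each source, widened to 16 bits.  The RVV path reaches
 * lanes 8..15 with vslidedown.vx(src, 8) instead of a separate
 * "get high half" step. */
static void vabdl_high_u8_ref(uint16_t r[8],
                              const uint8_t a[16], const uint8_t b[16]) {
  for (int i = 0; i < 8; i++) {
    int16_t d = (int16_t) (a[i + 8] - b[i + 8]);   /* widened subtract    */
    r[i] = (uint16_t) (d < 0 ? -d : d);            /* |d| fits in 16 bits */
  }
}
```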
43 changes: 31 additions & 12 deletions simde/arm/neon/addw.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2020 Sean Maher <[email protected]> (Copyright owned by Google, LLC)
* 2023 Chi-Wei Chu <[email protected]> (Copyright owned by NTHU pllab)
*/

#if !defined(SIMDE_ARM_NEON_ADDW_H)
@@ -41,14 +42,17 @@ simde_int16x8_t
simde_vaddw_s8(simde_int16x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_s8(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_s16(a, simde_vmovl_s8(b));
#else
simde_int16x8_private r_;
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_int8x8_private b_ = simde_int8x8_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv64);
r_.sv128 = __riscv_vwadd_wv_i16m1(a_.sv128, vb, 8);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -71,14 +75,17 @@ simde_int32x4_t
simde_vaddw_s16(simde_int32x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_s16(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_s32(a, simde_vmovl_s16(b));
#else
simde_int32x4_private r_;
simde_int32x4_private a_ = simde_int32x4_to_private(a);
simde_int16x4_private b_ = simde_int16x4_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64);
r_.sv128 = __riscv_vwadd_wv_i32m1(a_.sv128, vb, 4);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -101,14 +108,17 @@ simde_int64x2_t
simde_vaddw_s32(simde_int64x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_s32(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_s64(a, simde_vmovl_s32(b));
#else
simde_int64x2_private r_;
simde_int64x2_private a_ = simde_int64x2_to_private(a);
simde_int32x2_private b_ = simde_int32x2_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64);
r_.sv128 = __riscv_vwadd_wv_i64m1(a_.sv128, vb, 2);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -131,14 +141,17 @@ simde_uint16x8_t
simde_vaddw_u8(simde_uint16x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_u8(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_u16(a, simde_vmovl_u8(b));
#else
simde_uint16x8_private r_;
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_uint8x8_private b_ = simde_uint8x8_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64);
r_.sv128 = __riscv_vwaddu_wv_u16m1(a_.sv128, vb, 8);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -161,14 +174,17 @@ simde_uint32x4_t
simde_vaddw_u16(simde_uint32x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_u16(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_u32(a, simde_vmovl_u16(b));
#else
simde_uint32x4_private r_;
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
simde_uint16x4_private b_ = simde_uint16x4_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64);
r_.sv128 = __riscv_vwaddu_wv_u32m1(a_.sv128, vb, 4);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
@@ -191,14 +207,17 @@ simde_uint64x2_t
simde_vaddw_u32(simde_uint64x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vaddw_u32(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE)
return simde_vaddq_u64(a, simde_vmovl_u32(b));
#else
simde_uint64x2_private r_;
simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
simde_uint32x2_private b_ = simde_uint32x2_to_private(b);

#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_RISCV_V_NATIVE)
vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64);
r_.sv128 = __riscv_vwaddu_wv_u64m1(a_.sv128, vb, 2);
#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
r_.values += a_.values;
#else
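
For `addw` the RVV path is a single widening add in the `.wv` form, which takes an already-wide first operand and a narrow second operand, so no separate `vmovl`/widening step is needed before the add. Scalar reference of `vaddw_s8` (a sketch for clarity, not code from the commit):

```c
#include <stdint.h>

/* Reference semantics of vaddw_s8: each 8-bit lane of b is sign-extended
 * and added to the matching 16-bit lane of a, which is what
 * __riscv_vwadd_wv_i16m1(a, b, 8) computes in one instruction. */
static void vaddw_s8_ref(int16_t r[8], const int16_t a[8], const int8_t b[8]) {
  for (int i = 0; i < 8; i++) {
    r[i] = (int16_t) (a[i] + b[i]);   /* b[i] sign-extends on promotion */
  }
}
```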