From c903416679cb90af1e7685b3c9c04d7880069360 Mon Sep 17 00:00:00 2001 From: Chi-Wei Chu Date: Sat, 22 Jun 2024 03:36:56 +0800 Subject: [PATCH] arm/neon riscv64: additional RVV implementations - part 2. (#1189) Contains RVV implementations for the following Neon instructions: `abal`, `abdl_high`, `addw`, `addw_high`, `bcax`, `bic`, `cadd_rot270`, `cadd_rot90`, `cmla_lane`, `cmla_rot180_lane` , `cmla_rot270_lane`, `cmla_rot90_lane`, `combine`, `cvt`, `dot`, `dot_lane`, `dup_n`, `eor`, `ext`, `maxnmv`, `minnmv` , `movl` , `movn` , `qdmull` , `qshlu_n`, `rnda`, `rsubhn` , `shl`, `shl_n`, `shll_n`, `shr_n`, `shrn_n`, `sqadd`, `sqrt` --- simde/arm/neon/abal.h | 53 ++++ simde/arm/neon/abdl_high.h | 58 +++++ simde/arm/neon/addw.h | 43 +++- simde/arm/neon/addw_high.h | 100 +++++--- simde/arm/neon/bcax.h | 73 ++++++ simde/arm/neon/bic.h | 113 ++++++--- simde/arm/neon/cadd_rot270.h | 40 ++- simde/arm/neon/cadd_rot90.h | 40 ++- simde/arm/neon/cmla_lane.h | 332 ++++++++++++++++--------- simde/arm/neon/cmla_rot180_lane.h | 262 +++++++++++++------- simde/arm/neon/cmla_rot270_lane.h | 262 +++++++++++++------- simde/arm/neon/cmla_rot90_lane.h | 262 +++++++++++++------- simde/arm/neon/combine.h | 59 +++-- simde/arm/neon/cvt.h | 176 ++++++++++--- simde/arm/neon/dot.h | 141 ++++++++--- simde/arm/neon/dot_lane.h | 296 +++++++++++++++------- simde/arm/neon/dup_n.h | 124 +++++++--- simde/arm/neon/eor.h | 146 ++++++++--- simde/arm/neon/ext.h | 331 ++++++++++++++++--------- simde/arm/neon/maxnmv.h | 97 +++++--- simde/arm/neon/minnmv.h | 121 +++++---- simde/arm/neon/movl.h | 31 ++- simde/arm/neon/movn.h | 25 +- simde/arm/neon/qdmull.h | 32 ++- simde/arm/neon/qshlu_n.h | 43 +++- simde/arm/neon/rnda.h | 103 ++++++-- simde/arm/neon/rsubhn.h | 83 ++++--- simde/arm/neon/shl.h | 397 ++++++++++++++++++++++-------- simde/arm/neon/shl_n.h | 50 +++- simde/arm/neon/shll_n.h | 97 +++++--- simde/arm/neon/shr_n.h | 61 ++++- simde/arm/neon/shrn_n.h | 44 ++-- simde/arm/neon/sqadd.h | 151 ++++++++---- simde/arm/neon/sqrt.h | 77 +++--- 34 files changed, 3092 insertions(+), 1231 deletions(-) diff --git a/simde/arm/neon/abal.h b/simde/arm/neon/abal.h index 7e5093d37..e3af088f7 100644 --- a/simde/arm/neon/abal.h +++ b/simde/arm/neon/abal.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ABAL_H) @@ -39,6 +40,14 @@ simde_int16x8_t simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabal_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_, a_ = simde_int16x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + simde_int8x8_private c_ = simde_int8x8_to_private(c); + vint16m1_t rst = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(b_.sv64) , \ + __riscv_vlmul_trunc_v_i8m1_i8mf2(c_.sv64) , 8); + r_.sv128 = __riscv_vadd_vv_i16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8), a_.sv128, 8); + return simde_int16x8_from_private(r_); #else return simde_vaddq_s16(simde_vabdl_s8(b, c), a); #endif @@ -53,6 +62,13 @@ simde_int32x4_t simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabal_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_, a_ = simde_int32x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + simde_int16x4_private c_ = simde_int16x4_to_private(c); + 
vint32m1_t rst = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(b_.sv64) , __riscv_vlmul_trunc_v_i16m1_i16mf2(c_.sv64) , 4); + r_.sv128 = __riscv_vadd_vv_i32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4), a_.sv128, 4); + return simde_int32x4_from_private(r_); #else return simde_vaddq_s32(simde_vabdl_s16(b, c), a); #endif @@ -67,6 +83,13 @@ simde_int64x2_t simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabal_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_, a_ = simde_int64x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + simde_int32x2_private c_ = simde_int32x2_to_private(c); + vint64m1_t rst = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(b_.sv64) , __riscv_vlmul_trunc_v_i32m1_i32mf2(c_.sv64) , 2); + r_.sv128 = __riscv_vadd_vv_i64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2), a_.sv128, 2); + return simde_int64x2_from_private(r_); #else return simde_vaddq_s64(simde_vabdl_s32(b, c), a); #endif @@ -81,6 +104,16 @@ simde_uint16x8_t simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabal_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_, a_ = simde_uint16x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + simde_uint8x8_private c_ = simde_uint8x8_to_private(c); + vint16m1_t a_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(b_.sv64), 8)); + vint16m1_t b_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(c_.sv64), 8)); + vint16m1_t rst = __riscv_vsub_vv_i16m1(a_tmp, b_tmp, 8); + r_.sv128 = __riscv_vadd_vv_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8)), \ + a_.sv128, 8); + return simde_uint16x8_from_private(r_); #else return simde_vaddq_u16(simde_vabdl_u8(b, c), a); #endif @@ -95,6 +128,16 @@ simde_uint32x4_t simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabal_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_, a_ = simde_uint32x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + simde_uint16x4_private c_ = simde_uint16x4_to_private(c); + vint32m1_t a_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(b_.sv64), 4)); + vint32m1_t b_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(c_.sv64), 4)); + vint32m1_t rst = __riscv_vsub_vv_i32m1(a_tmp, b_tmp, 4); + r_.sv128 = __riscv_vadd_vv_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4)), \ + a_.sv128, 4); + return simde_uint32x4_from_private(r_); #else return simde_vaddq_u32(simde_vabdl_u16(b, c), a); #endif @@ -109,6 +152,16 @@ simde_uint64x2_t simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabal_u32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_, a_ = simde_uint64x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + simde_uint32x2_private c_ = simde_uint32x2_to_private(c); + vint64m1_t a_tmp = 
__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(b_.sv64), 2)); + vint64m1_t b_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(c_.sv64), 2)); + vint64m1_t rst = __riscv_vsub_vv_i64m1(a_tmp, b_tmp, 4); + r_.sv128 = __riscv_vadd_vv_u64m1(__riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2)), \ + a_.sv128, 2); + return simde_uint64x2_from_private(r_); #else return simde_vaddq_u64(simde_vabdl_u32(b, c), a); #endif diff --git a/simde/arm/neon/abdl_high.h b/simde/arm/neon/abdl_high.h index 826b1ba33..4672a5b28 100644 --- a/simde/arm/neon/abdl_high.h +++ b/simde/arm/neon/abdl_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ABDL_HIGH_H) @@ -38,6 +39,14 @@ simde_int16x8_t simde_vabdl_high_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vabdl_high_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + vint16m1_t rst = __riscv_vwsub_vv_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(a_.sv128 , 8 , 16)), + __riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16)) , 8); + r_.sv128 = __riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8); + return simde_int16x8_from_private(r_); #else return simde_vabdl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b)); #endif @@ -52,6 +61,14 @@ simde_int32x4_t simde_vabdl_high_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vabdl_high_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + vint32m1_t rst = __riscv_vwsub_vv_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(a_.sv128 , 4 , 8)) , \ + __riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8)) , 4); + r_.sv128 = __riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4); + return simde_int32x4_from_private(r_); #else return simde_vabdl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b)); #endif @@ -66,6 +83,14 @@ simde_int64x2_t simde_vabdl_high_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vabdl_high_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + vint64m1_t rst = __riscv_vwsub_vv_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(a_.sv128 , 2 , 4)) , \ + __riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(b_.sv128 , 2 , 4)) , 2); + r_.sv128 = __riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2); + return simde_int64x2_from_private(r_); #else return simde_vabdl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b)); #endif @@ -80,6 +105,17 @@ simde_uint16x8_t simde_vabdl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vabdl_high_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_uint8x16_private b_ = 
simde_uint8x16_to_private(b); + vint16m1_t a_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1( \ + __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(a_.sv128 , 8 , 16)), 8)); + vint16m1_t b_tmp = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1( \ + __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16)), 8)); + vint16m1_t rst = __riscv_vsub_vv_i16m1(a_tmp, b_tmp, 8); + r_.sv128 = __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vv_i16m1(rst , __riscv_vneg_v_i16m1(rst , 8) , 8)); + return simde_uint16x8_from_private(r_); #else return simde_vabdl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b)); #endif @@ -94,6 +130,17 @@ simde_uint32x4_t simde_vabdl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vabdl_high_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_uint16x8_private b_ = simde_uint16x8_to_private(b); + vint32m1_t a_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1( \ + __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(a_.sv128 , 4 , 8)), 4)); + vint32m1_t b_tmp = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwcvtu_x_x_v_u32m1( \ + __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8)), 4)); + vint32m1_t rst = __riscv_vsub_vv_i32m1(a_tmp, b_tmp, 4); + r_.sv128 = __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vv_i32m1(rst , __riscv_vneg_v_i32m1(rst , 4) , 4)); + return simde_uint32x4_from_private(r_); #else return simde_vabdl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b)); #endif @@ -108,6 +155,17 @@ simde_uint64x2_t simde_vabdl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vabdl_high_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_uint32x4_private b_ = simde_uint32x4_to_private(b); + vint64m1_t a_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1( \ + __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(a_.sv128 , 2 , 4)), 2)); + vint64m1_t b_tmp = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vwcvtu_x_x_v_u64m1( \ + __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(b_.sv128 , 2 , 4)), 2)); + vint64m1_t rst = __riscv_vsub_vv_i64m1(a_tmp, b_tmp, 4); + r_.sv128 = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vmax_vv_i64m1(rst , __riscv_vneg_v_i64m1(rst , 2) , 2)); + return simde_uint64x2_from_private(r_); #else return simde_vabdl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b)); #endif diff --git a/simde/arm/neon/addw.h b/simde/arm/neon/addw.h index ec736215f..f38b4d777 100644 --- a/simde/arm/neon/addw.h +++ b/simde/arm/neon/addw.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDW_H) @@ -41,14 +42,17 @@ simde_int16x8_t simde_vaddw_s8(simde_int16x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_s8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s16(a, simde_vmovl_s8(b)); #else simde_int16x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int8x8_private b_ = simde_int8x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t vb = __riscv_vlmul_trunc_v_i8m1_i8mf2 (b_.sv64); + r_.sv128 = __riscv_vwadd_wv_i16m1(a_.sv128, vb, 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -71,14 +75,17 @@ simde_int32x4_t simde_vaddw_s16(simde_int32x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_s16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s32(a, simde_vmovl_s16(b)); #else simde_int32x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint16mf2_t vb = __riscv_vlmul_trunc_v_i16m1_i16mf2 (b_.sv64); + r_.sv128 = __riscv_vwadd_wv_i32m1(a_.sv128, vb, 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -101,14 +108,17 @@ simde_int64x2_t simde_vaddw_s32(simde_int64x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_s32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s64(a, simde_vmovl_s32(b)); #else simde_int64x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint32mf2_t vb = __riscv_vlmul_trunc_v_i32m1_i32mf2 (b_.sv64); + r_.sv128 = __riscv_vwadd_wv_i64m1(a_.sv128, vb, 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -131,14 +141,17 @@ simde_uint16x8_t simde_vaddw_u8(simde_uint16x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_u8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u16(a, simde_vmovl_u8(b)); #else simde_uint16x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint8x8_private b_ = simde_uint8x8_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8mf2_t vb = __riscv_vlmul_trunc_v_u8m1_u8mf2 (b_.sv64); + r_.sv128 = __riscv_vwaddu_wv_u16m1(a_.sv128, vb, 8); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -161,14 +174,17 @@ simde_uint32x4_t simde_vaddw_u16(simde_uint32x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_u16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u32(a, simde_vmovl_u16(b)); #else simde_uint32x4_private r_; 
simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_uint16x4_private b_ = simde_uint16x4_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16mf2_t vb = __riscv_vlmul_trunc_v_u16m1_u16mf2 (b_.sv64); + r_.sv128 = __riscv_vwaddu_wv_u32m1(a_.sv128, vb, 4); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else @@ -191,14 +207,17 @@ simde_uint64x2_t simde_vaddw_u32(simde_uint64x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vaddw_u32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u64(a, simde_vmovl_u32(b)); #else simde_uint64x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_uint32x2_private b_ = simde_uint32x2_to_private(b); - #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32mf2_t vb = __riscv_vlmul_trunc_v_u32m1_u32mf2 (b_.sv64); + r_.sv128 = __riscv_vwaddu_wv_u64m1(a_.sv128, vb, 2); + #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, b_.values); r_.values += a_.values; #else diff --git a/simde/arm/neon/addw_high.h b/simde/arm/neon/addw_high.h index 1f2df9052..be293ac60 100644 --- a/simde/arm/neon/addw_high.h +++ b/simde/arm/neon/addw_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADDW_HIGH_H) @@ -30,6 +31,7 @@ #include "types.h" #include "movl_high.h" #include "add.h" +#include "addw.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -40,17 +42,22 @@ simde_int16x8_t simde_vaddw_high_s8(simde_int16x8_t a, simde_int8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_s8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s16(a, simde_vmovl_high_s8(b)); #else simde_int16x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int8x16_private b_ = simde_int8x16_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_high = __riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(b_.sv128 , 8 , 16)); + r_.sv128 = __riscv_vwadd_wv_i16m1(a_.sv128, b_high, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_int16x8_from_private(r_); #endif @@ -65,17 +72,22 @@ simde_int32x4_t simde_vaddw_high_s16(simde_int32x4_t a, simde_int16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_s16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s32(a, simde_vmovl_high_s16(b)); #else simde_int32x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); 
simde_int16x8_private b_ = simde_int16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16mf2_t b_high = __riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(b_.sv128 , 4 , 8)); + r_.sv128 = __riscv_vwadd_wv_i32m1(a_.sv128, b_high, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_int32x4_from_private(r_); #endif @@ -90,18 +102,21 @@ simde_int64x2_t simde_vaddw_high_s32(simde_int64x2_t a, simde_int32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_s32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_s64(a, simde_vmovl_high_s32(b)); #else simde_int64x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x4_private b_ = simde_int32x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32mf2_t b_high = __riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(b_.sv128 , 2 , 4)); + r_.sv128 = __riscv_vwadd_wv_i64m1(a_.sv128, b_high, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_int64x2_from_private(r_); #endif } @@ -115,18 +130,21 @@ simde_uint16x8_t simde_vaddw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_u8(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u16(a, simde_vmovl_high_u8(b)); #else simde_uint16x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint8x16_private b_ = simde_uint8x16_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8mf2_t b_high = __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(b_.sv128 , 8 , 16)); + r_.sv128 = __riscv_vwaddu_wv_u16m1(a_.sv128, b_high, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_uint16x8_from_private(r_); #endif } @@ -140,18 +158,21 @@ simde_uint32x4_t simde_vaddw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_u16(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u32(a, simde_vmovl_high_u16(b)); #else simde_uint32x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_uint16x8_private b_ = simde_uint16x8_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < 
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16mf2_t b_high = __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(b_.sv128 , 4 , 8)); + r_.sv128 = __riscv_vwaddu_wv_u32m1(a_.sv128, b_high, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_uint32x4_from_private(r_); #endif } @@ -165,18 +186,21 @@ simde_uint64x2_t simde_vaddw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddw_high_u32(a, b); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_RISCV_V_NATIVE) return simde_vaddq_u64(a, simde_vmovl_high_u32(b)); #else simde_uint64x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_uint32x4_private b_ = simde_uint32x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32mf2_t b_high = __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(b_.sv128 , 2 , 4)); + r_.sv128 = __riscv_vwaddu_wv_u64m1(a_.sv128, b_high, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)]; + } + #endif return simde_uint64x2_from_private(r_); #endif } diff --git a/simde/arm/neon/bcax.h b/simde/arm/neon/bcax.h index b9e84ccba..746d8d613 100644 --- a/simde/arm/neon/bcax.h +++ b/simde/arm/neon/bcax.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_BCAX_H) @@ -41,6 +42,15 @@ simde_uint8x16_t simde_vbcaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b), + c_ = simde_uint8x16_to_private(c); + r_.sv128 = __riscv_vxor_vv_u8m1(a_.sv128, __riscv_vand_vv_u8m1(b_.sv128 , \ + __riscv_vnot_v_u8m1(c_.sv128 , 16), 16), 16); + return simde_uint8x16_from_private(r_); #else return simde_veorq_u8(a, simde_vbicq_u8(b, c)); #endif @@ -55,6 +65,15 @@ simde_uint16x8_t simde_vbcaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b), + c_ = simde_uint16x8_to_private(c); + r_.sv128 = __riscv_vxor_vv_u16m1(a_.sv128, __riscv_vand_vv_u16m1(b_.sv128 , \ + __riscv_vnot_v_u16m1(c_.sv128 , 8), 8), 8); + return simde_uint16x8_from_private(r_); #else return simde_veorq_u16(a, simde_vbicq_u16(b, c)); #endif @@ -69,6 +88,15 @@ simde_uint32x4_t simde_vbcaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u32(a, b, c); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + r_.sv128 = __riscv_vxor_vv_u32m1(a_.sv128, __riscv_vand_vv_u32m1(b_.sv128 , \ + __riscv_vnot_v_u32m1(c_.sv128 , 4), 4), 4); + return simde_uint32x4_from_private(r_); #else return simde_veorq_u32(a, simde_vbicq_u32(b, c)); #endif @@ -83,6 +111,15 @@ simde_uint64x2_t simde_vbcaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b), + c_ = simde_uint64x2_to_private(c); + r_.sv128 = __riscv_vxor_vv_u64m1(a_.sv128, __riscv_vand_vv_u64m1(b_.sv128 , \ + __riscv_vnot_v_u64m1(c_.sv128 , 2), 2), 2); + return simde_uint64x2_from_private(r_); #else return simde_veorq_u64(a, simde_vbicq_u64(b, c)); #endif @@ -97,6 +134,15 @@ simde_int8x16_t simde_vbcaxq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s8(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b), + c_ = simde_int8x16_to_private(c); + r_.sv128 = __riscv_vxor_vv_i8m1(a_.sv128, __riscv_vand_vv_i8m1(b_.sv128 , \ + __riscv_vnot_v_i8m1(c_.sv128 , 16), 16), 16); + return simde_int8x16_from_private(r_); #else return simde_veorq_s8(a, simde_vbicq_s8(b, c)); #endif @@ -111,6 +157,15 @@ simde_int16x8_t simde_vbcaxq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s16(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + r_.sv128 = __riscv_vxor_vv_i16m1(a_.sv128, __riscv_vand_vv_i16m1(b_.sv128 , \ + __riscv_vnot_v_i16m1(c_.sv128 , 8), 8), 8); + return simde_int16x8_from_private(r_); #else return simde_veorq_s16(a,simde_vbicq_s16(b, c)); #endif @@ -125,6 +180,15 @@ simde_int32x4_t simde_vbcaxq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s32(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + r_.sv128 = __riscv_vxor_vv_i32m1(a_.sv128, __riscv_vand_vv_i32m1(b_.sv128 , \ + __riscv_vnot_v_i32m1(c_.sv128 , 4), 4), 4); + return simde_int32x4_from_private(r_); #else return simde_veorq_s32(a, simde_vbicq_s32(b, c)); #endif @@ -139,6 +203,15 @@ simde_int64x2_t simde_vbcaxq_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s64(a, b, c); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b), + c_ = simde_int64x2_to_private(c); + r_.sv128 = __riscv_vxor_vv_i64m1(a_.sv128, __riscv_vand_vv_i64m1(b_.sv128 , \ + __riscv_vnot_v_i64m1(c_.sv128 , 2), 2), 2); + return simde_int64x2_from_private(r_); #else return simde_veorq_s64(a, simde_vbicq_s64(b, c)); #endif diff --git a/simde/arm/neon/bic.h 
b/simde/arm/neon/bic.h index 49cc7f396..88a68ae5f 100644 --- a/simde/arm/neon/bic.h +++ b/simde/arm/neon/bic.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_BIC_H) @@ -48,9 +49,13 @@ simde_vbic_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i8m1(a_.sv64 , __riscv_vnot_v_i8m1(b_.sv64 , 8) , 8); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_int8x8_from_private(r_); @@ -75,9 +80,13 @@ simde_vbic_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i16m1(a_.sv64 , __riscv_vnot_v_i16m1(b_.sv64 , 4) , 4); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_int16x4_from_private(r_); @@ -102,9 +111,13 @@ simde_vbic_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i32m1(a_.sv64 , __riscv_vnot_v_i32m1(b_.sv64 , 2) , 2); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_int32x2_from_private(r_); @@ -129,9 +142,13 @@ simde_vbic_s64(simde_int64x1_t a, simde_int64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i64m1(a_.sv64 , __riscv_vnot_v_i64m1(b_.sv64 , 1) , 1); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_int64x1_from_private(r_); @@ -156,9 +173,13 @@ simde_vbic_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u8m1(a_.sv64 , __riscv_vnot_v_u8m1(b_.sv64 , 8) , 8); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_uint8x8_from_private(r_); @@ -183,9 +204,13 @@ simde_vbic_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if 
defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u16m1(a_.sv64 , __riscv_vnot_v_u16m1(b_.sv64 , 4) , 4); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_uint16x4_from_private(r_); @@ -210,9 +235,13 @@ simde_vbic_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u32m1(a_.sv64 , __riscv_vnot_v_u32m1(b_.sv64 , 2) , 2); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_uint32x2_from_private(r_); @@ -237,9 +266,13 @@ simde_vbic_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(b_.m64, a_.m64); #else - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] & ~b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u64m1(a_.sv64 , __riscv_vnot_v_u64m1(b_.sv64 , 1) , 1); + #else + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] & ~b_.values[i]; + } + #endif #endif return simde_uint64x1_from_private(r_); @@ -263,7 +296,9 @@ simde_vbicq_s8(simde_int8x16_t a, simde_int8x16_t b) { b_ = simde_int8x16_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i8m1(a_.sv128 , __riscv_vnot_v_i8m1(b_.sv128 , 16) , 16); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -294,7 +329,9 @@ simde_vbicq_s16(simde_int16x8_t a, simde_int16x8_t b) { b_ = simde_int16x8_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i16m1(a_.sv128 , __riscv_vnot_v_i16m1(b_.sv128 , 8) , 8); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -325,7 +362,9 @@ simde_vbicq_s32(simde_int32x4_t a, simde_int32x4_t b) { b_ = simde_int32x4_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i32m1(a_.sv128 , __riscv_vnot_v_i32m1(b_.sv128 , 4) , 4); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -356,7 +395,9 @@ simde_vbicq_s64(simde_int64x2_t a, simde_int64x2_t b) { b_ = simde_int64x2_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i64m1(a_.sv128 , __riscv_vnot_v_i64m1(b_.sv128 , 2) , 2); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -387,7 +428,9 @@ simde_vbicq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { b_ = simde_uint8x16_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u8m1(a_.sv128 , 
__riscv_vnot_v_u8m1(b_.sv128 , 16) , 16); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -418,7 +461,9 @@ simde_vbicq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { b_ = simde_uint16x8_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u16m1(a_.sv128 , __riscv_vnot_v_u16m1(b_.sv128 , 8) , 8); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -449,7 +494,9 @@ simde_vbicq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { b_ = simde_uint32x4_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u32m1(a_.sv128 , __riscv_vnot_v_u32m1(b_.sv128 , 4) , 4); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); @@ -480,7 +527,9 @@ simde_vbicq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { b_ = simde_uint64x2_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u64m1(a_.sv128 , __riscv_vnot_v_u64m1(b_.sv128 , 2) , 2); + #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_andnot_si128(b_.m128i, a_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_andnot(a_.v128, b_.v128); diff --git a/simde/arm/neon/cadd_rot270.h b/simde/arm/neon/cadd_rot270.h index 17995f48a..1c557c1df 100644 --- a/simde/arm/neon/cadd_rot270.h +++ b/simde/arm/neon/cadd_rot270.h @@ -21,7 +21,7 @@ * SOFTWARE. 
* * Copyright: - * 2023 Chi-Wei Chu + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CADD_ROT270_H) @@ -47,7 +47,12 @@ simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t return vcadd_rot270_f16(a, b); #else simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + uint16_t idx1[4] = {5, 0, 7, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); r_.values = b_.values + a_.values; @@ -77,7 +82,13 @@ simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t return vcaddq_rot270_f16(a, b); #else simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + uint16_t idx1[8] = {9, 0, 11, 2, 13, 4, 15, 6}; + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6); r_.values = b_.values + a_.values; @@ -107,7 +118,12 @@ simde_float32x2_t simde_vcadd_rot270_f32(simde_float32x2_t a, simde_float32x2_t return vcadd_rot270_f32(a, b); #else simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {3, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfadd_vv_f32m1(op1, a_.sv64, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); r_.values = b_.values + a_.values; #else @@ -135,7 +151,13 @@ simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t return vcaddq_rot270_f32(a, b); #else simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {5, 0, 7, 2}; + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfadd_vv_f32m1(op1, a_.sv128, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) b_.values = SIMDE_SHUFFLE_VECTOR_(32, 
16, -b_.values, b_.values, 5, 0, 7, 2); r_.values = b_.values + a_.values; #else @@ -163,7 +185,13 @@ simde_float64x2_t simde_vcaddq_rot270_f64(simde_float64x2_t a, simde_float64x2_t return vcaddq_rot270_f64(a, b); #else simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + uint64_t idx1[2] = {3, 0}; + vfloat64m2_t b_tmp = __riscv_vlmul_ext_v_f64m1_f64m2 (b_.sv128); + vfloat64m1_t op1 = __riscv_vlmul_trunc_v_f64m2_f64m1(__riscv_vrgather_vv_f64m2(__riscv_vslideup_vx_f64m2( \ + __riscv_vfneg_v_f64m2(b_tmp, 2), b_tmp, 2, 4), __riscv_vle64_v_u64m2(idx1, 2), 2)); + r_.sv128 = __riscv_vfadd_vv_f64m1(op1, a_.sv128, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 3, 0); r_.values = b_.values + a_.values; #else diff --git a/simde/arm/neon/cadd_rot90.h b/simde/arm/neon/cadd_rot90.h index 0c448a521..70a2c4f9d 100644 --- a/simde/arm/neon/cadd_rot90.h +++ b/simde/arm/neon/cadd_rot90.h @@ -21,7 +21,7 @@ * SOFTWARE. * * Copyright: - * 2023 Chi-Wei Chu + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CADD_ROT90_H) @@ -47,7 +47,12 @@ simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b return vcadd_rot90_f16(a, b); #else simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + uint16_t idx1[4] = {1, 4, 3, 6}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); r_.values = b_.values + a_.values; @@ -77,7 +82,13 @@ simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t return vcaddq_rot90_f16(a, b); #else simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + uint16_t idx1[8] = {1, 8, 3, 10, 5, 12, 7, 14}; + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 1, 8, 3, 10, 5, 12, 7, 14); r_.values = b_.values + a_.values; @@ -107,7 +118,12 @@ simde_float32x2_t simde_vcadd_rot90_f32(simde_float32x2_t a, simde_float32x2_t b return vcadd_rot90_f32(a, b); #else simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {1, 2}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 
2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfadd_vv_f32m1(op1, a_.sv64, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); r_.values = b_.values + a_.values; #else @@ -135,7 +151,13 @@ simde_float32x4_t simde_vcaddq_rot90_f32(simde_float32x4_t a, simde_float32x4_t return vcaddq_rot90_f32(a, b); #else simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {1, 4, 3, 6}; + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfadd_vv_f32m1(op1, a_.sv128, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values = b_.values + a_.values; #else @@ -163,7 +185,13 @@ simde_float64x2_t simde_vcaddq_rot90_f64(simde_float64x2_t a, simde_float64x2_t return vcaddq_rot90_f64(a, b); #else simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + uint64_t idx1[2] = {1, 2}; + vfloat64m2_t b_tmp = __riscv_vlmul_ext_v_f64m1_f64m2 (b_.sv128); + vfloat64m1_t op1 = __riscv_vlmul_trunc_v_f64m2_f64m1(__riscv_vrgather_vv_f64m2(__riscv_vslideup_vx_f64m2( \ + __riscv_vfneg_v_f64m2(b_tmp, 2), b_tmp, 2, 4), __riscv_vle64_v_u64m2(idx1, 2), 2)); + r_.sv128 = __riscv_vfadd_vv_f64m1(op1, a_.sv128, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 1, 2); r_.values = b_.values + a_.values; #else diff --git a/simde/arm/neon/cmla_lane.h b/simde/arm/neon/cmla_lane.h index 4355bf7a5..9256ceccc 100644 --- a/simde/arm/neon/cmla_lane.h +++ b/simde/arm/neon/cmla_lane.h @@ -21,7 +21,7 @@ * SOFTWARE. 
* * Copyright: - * 2023 Chi-Wei Chu + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CMLA_LANE_H) @@ -43,23 +43,34 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + uint16_t idx1[4] = {0, 0, 2, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfmacc_vf_f16m1(r_.sv64, b_.values[lane], op1, 4); + return simde_float16x4_from_private(r_); + #else + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), b_ = simde_float32x4_to_private( simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_lane_f16 @@ -75,20 +86,31 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + uint32_t idx1[2] = {0, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfmacc_vf_f32m1(r_.sv64, b_.values[lane], op1, 2); + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - 
r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_float32x2_from_private(r_); #endif - return simde_float32x2_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_lane_f32 @@ -104,23 +126,34 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a); + simde_float16x8_private b_ = simde_float16x8_to_private(b); + uint16_t idx1[4] = {0, 0, 2, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfmacc_vf_f16m1(r_.sv64, b_.values[lane], op1, 4); + return simde_float16x4_from_private(r_); + #else + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), b_ = simde_float32x4_to_private( simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_laneq_f16 @@ -136,20 +169,31 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) - a_.values = 
SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a); + simde_float32x4_private b_ = simde_float32x4_to_private(b); + uint32_t idx1[2] = {0, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfmacc_vf_f32m1(r_.sv64, b_.values[lane], op1, 2); + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_float32x2_from_private(r_); #endif - return simde_float32x2_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_laneq_f32 @@ -165,30 +209,42 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a); + simde_float16x4_private b_ = simde_float16x4_to_private(b); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2( \ + __riscv_vslideup_vx_f16m2(a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfmacc_vf_f16m1(r_.sv128, b_.values[lane], op1, 8); + return simde_float16x8_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i]; - r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i]; - r_high.values[2 * i] 
+= b_.values[lane] * a_high.values[2 * i]; - r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i]; - } + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i]; + r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i]; + r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i]; + r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_lane_f16 @@ -204,20 +260,32 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a); + simde_float32x2_private b_ = simde_float32x2_to_private(b); + uint32_t idx1[4] = {0, 0, 2, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2( \ + __riscv_vslideup_vx_f32m2(a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfmacc_vf_f32m1(r_.sv128, b_.values[lane], op1, 4); + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i 
< (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_float32x4_from_private(r_); #endif - return simde_float32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_lane_f32 @@ -233,30 +301,42 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); - r_low.values += b_.values * a_low.values; - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); - r_high.values += b_.values * a_high.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a); + simde_float16x8_private b_ = simde_float16x8_to_private(b); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2( \ + __riscv_vslideup_vx_f16m2(a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfmacc_vf_f16m1(r_.sv128, b_.values[lane], op1, 8); + return simde_float16x8_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i]; - r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i]; - r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i]; - r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i]; - } + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + r_low.values += b_.values * a_low.values; + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i]; + r_low.values[2 * 
i + 1] += b_.values[lane] * a_low.values[2 * i]; + r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i]; + r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_laneq_f16 @@ -272,21 +352,33 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a); + simde_float32x4_private b_ = simde_float32x4_to_private(b); + uint32_t idx1[4] = {0, 0, 2, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2( \ + __riscv_vslideup_vx_f32m2(a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfmacc_vf_f32m1(r_.sv128, b_.values[lane], op1, 4); + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); + + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_float32x4_from_private(r_); #endif - return simde_float32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_laneq_f32 diff --git a/simde/arm/neon/cmla_rot180_lane.h b/simde/arm/neon/cmla_rot180_lane.h index d72225917..0c25d0ee4 100644 --- a/simde/arm/neon/cmla_rot180_lane.h +++ b/simde/arm/neon/cmla_rot180_lane.h @@ -21,7 +21,7 @@ * SOFTWARE. 
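// RVV note for the vcmla_rot180 lane variants below (a sketch of the shared recipe, not a
// normative description): `a` is slid up onto a copy of itself and vrgather'd with the index
// vector {0, 0, 2, 2, ...} so the real element of each complex pair lands in both slots (op1);
// the lane-broadcast `b` is negated, slid up under its non-negated copy, and vrgather'd with
// {0, 1, 2, 3, ...} so every selected element is -b (op2); a single vfmacc then accumulates
// op1 * op2 into `r`.  Assuming the broadcast vector is spelled out as a plain array, the
// accumulation matches the portable fallback loop in this file.  The helper below is
// illustrative only; its name is hypothetical and is not defined by this patch.
static void cmla_rot180_ref(float r[], const float a[], const float b_dup[], int n_pairs) {
  for (int i = 0 ; i < n_pairs ; i++) {
    r[2 * i]     += -(b_dup[2 * i])     * a[2 * i];  /* real slot      */
    r[2 * i + 1] += -(b_dup[2 * i + 1]) * a[2 * i];  /* imaginary slot */
  }
}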
* * Copyright: - * 2023 Chi-Wei Chu + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT180_LANE_H) @@ -43,24 +43,38 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot180_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]); + uint16_t idx1[4] = {0, 0, 2, 2}; + uint16_t idx2[4] = {0, 1, 2, 3}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; - } + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_rot180_lane_f16 @@ -78,7 +92,15 @@ simde_float32x2_t simde_vcmla_rot180_lane_f32(simde_float32x2_t r, simde_float32 { simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {0, 0}; + uint32_t idx2[2] = {0, 1}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), 
__riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); r_.values += b_.values * a_.values; @@ -106,31 +128,47 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_vdupq_n_f16(simde_float16x4_to_private(b).values[lane]); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + uint16_t idx2[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i]; - r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i]; - r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i]; - r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i]; - } + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if 
defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i]; + r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i]; + r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i]; + r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_rot180_lane_f16 @@ -148,7 +186,17 @@ simde_float32x4_t simde_vcmlaq_rot180_lane_f32(simde_float32x4_t r, simde_float3 { simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {0, 0, 2, 2}; + uint32_t idx2[4] = {0, 1, 2, 3}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); r_.values += b_.values * a_.values; @@ -176,24 +224,38 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot180_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + 
simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]); + uint16_t idx1[4] = {0, 0, 2, 2}; + uint16_t idx2[4] = {0, 1, 2, 3}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; - } + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_rot180_laneq_f16 @@ -211,7 +273,15 @@ simde_float32x2_t simde_vcmla_rot180_laneq_f32(simde_float32x2_t r, simde_float3 { simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {0, 0}; + uint32_t idx2[2] = {0, 1}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); r_.values += b_.values * a_.values; @@ -239,31 +309,47 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = 
simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_vdupq_n_f16(simde_float16x8_to_private(b).values[lane]); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + uint16_t idx2[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i]; - r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i]; - r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i]; - r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i]; - } + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i]; + r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i]; + r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i]; + r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i]; + } + 
#endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_rot180_laneq_f16 @@ -281,7 +367,17 @@ simde_float32x4_t simde_vcmlaq_rot180_laneq_f32(simde_float32x4_t r, simde_float { simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {0, 0, 2, 2}; + uint32_t idx2[4] = {0, 1, 2, 3}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); r_.values += b_.values * a_.values; diff --git a/simde/arm/neon/cmla_rot270_lane.h b/simde/arm/neon/cmla_rot270_lane.h index d8d64dd38..38cd93460 100644 --- a/simde/arm/neon/cmla_rot270_lane.h +++ b/simde/arm/neon/cmla_rot270_lane.h @@ -21,7 +21,7 @@ * SOFTWARE. 
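// RVV note for the vcmla_rot270 lane variants below: the same vrgather/vslideup recipe as the
// rot180 file, except that op1 duplicates the imaginary element of each pair in `a` (index
// vector {1, 1, 3, 3, ...}) and op2 picks alternately from the non-negated and negated halves
// of the slid-up `b` (indices {5, 0, 7, 2, ...} over [-b | b]), giving {+b[1], -b[0], +b[3],
// -b[2], ...}.  A scalar sketch of the resulting accumulation, mirroring the portable fallback
// loop (illustrative helper with a hypothetical name):
static void cmla_rot270_ref(float r[], const float a[], const float b_dup[], int n_pairs) {
  for (int i = 0 ; i < n_pairs ; i++) {
    r[2 * i]     +=   b_dup[2 * i + 1]  * a[2 * i + 1];  /* real slot      */
    r[2 * i + 1] += -(b_dup[2 * i])     * a[2 * i + 1];  /* imaginary slot */
  }
}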
* * Copyright: - * 2023 Chi-Wei Chu + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT270_LANE_H) @@ -43,24 +43,38 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot270_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {5, 0, 7, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; - } + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_rot270_lane_f16 @@ -78,7 +92,15 @@ simde_float32x2_t simde_vcmla_rot270_lane_f32(simde_float32x2_t r, simde_float32 { simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {3, 0}; + vfloat32m1_t op1 = 
__riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); r_.values += b_.values * a_.values; @@ -106,31 +128,47 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_rot270_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_vdupq_n_f16(simde_float16x4_to_private(b).values[lane]); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {9, 0, 11, 2, 13, 4, 15, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1]; - r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1]; - r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1]; - r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1]; - } + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + 
simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1]; + r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1]; + r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1]; + r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_rot270_lane_f16 @@ -148,7 +186,17 @@ simde_float32x4_t simde_vcmlaq_rot270_lane_f32(simde_float32x4_t r, simde_float3 { simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {5, 0, 7, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_.values += b_.values * a_.values; @@ -176,24 +224,38 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot270_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); - r_.values += 
b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {5, 0, 7, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; - } + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_rot270_laneq_f16 @@ -211,7 +273,15 @@ simde_float32x2_t simde_vcmla_rot270_laneq_f32(simde_float32x2_t r, simde_float3 { simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {3, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); r_.values += b_.values * a_.values; @@ -239,31 +309,47 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_rot270_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = 
simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); - r_high.values += b_.values * a_high.values; - r_low.values += b_.values * a_low.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_vdupq_n_f16(simde_float16x8_to_private(b).values[lane]); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {9, 0, 11, 2, 13, 4, 15, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1]; - r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1]; - r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1]; - r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1]; - } + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_high.values += b_.values * a_high.values; + r_low.values += b_.values * a_low.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1]; + r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1]; + r_high.values[2 * i] += b_.values[2 * i + 1] * 
a_high.values[2 * i + 1]; + r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_rot270_laneq_f16 @@ -281,7 +367,17 @@ simde_float32x4_t simde_vcmlaq_rot270_laneq_f32(simde_float32x4_t r, simde_float { simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {5, 0, 7, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_.values += b_.values * a_.values; diff --git a/simde/arm/neon/cmla_rot90_lane.h b/simde/arm/neon/cmla_rot90_lane.h index 45df8c0ed..338930281 100644 --- a/simde/arm/neon/cmla_rot90_lane.h +++ b/simde/arm/neon/cmla_rot90_lane.h @@ -21,7 +21,7 @@ * SOFTWARE. 
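// RVV note for the vcmla_rot90 lane variants below: op1 again duplicates the imaginary element
// of each pair in `a` ({1, 1, 3, 3, ...}); op2 uses indices {1, 4, 3, 6, ...} over [-b | b],
// i.e. {-b[1], +b[0], -b[3], +b[2], ...}.  Scalar sketch of the accumulation, mirroring the
// portable fallback loop (illustrative helper with a hypothetical name):
static void cmla_rot90_ref(float r[], const float a[], const float b_dup[], int n_pairs) {
  for (int i = 0 ; i < n_pairs ; i++) {
    r[2 * i]     += -(b_dup[2 * i + 1]) * a[2 * i + 1];  /* real slot      */
    r[2 * i + 1] +=   b_dup[2 * i]      * a[2 * i + 1];  /* imaginary slot */
  }
}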
* * Copyright: - * 2023 Chi-Wei Chu + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT90_LANE_H) @@ -43,24 +43,38 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot90_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {1, 4, 3, 6}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; - } + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_rot90_lane_f16 @@ -78,7 +92,15 @@ simde_float32x2_t simde_vcmla_rot90_lane_f32(simde_float32x2_t r, simde_float32x { simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {1, 2}; + vfloat32m1_t op1 = 
__riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); r_.values += b_.values * a_.values; @@ -106,24 +128,38 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot90_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); - r_.values += b_.values * a_.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {1, 4, 3, 6}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; - } + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmla_rot90_laneq_f16 @@ -142,7 +178,15 @@ simde_float32x2_t 
simde_vcmla_rot90_laneq_f32(simde_float32x2_t r, simde_float32 simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {1, 2}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); r_.values += b_.values * a_.values; @@ -170,31 +214,47 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_rot90_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_vdupq_n_f16(simde_float16x4_to_private(b).values[lane]); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {1, 8, 3, 10, 5, 12, 7, 14}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1]; - r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1]; - r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1]; - r_high.values[2 * i + 1] += 
b_.values[2 * i] * a_high.values[2 * i + 1]; - } + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1]; + r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1]; + r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1]; + r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_rot90_lane_f16 @@ -212,7 +272,17 @@ simde_float32x4_t simde_vcmlaq_rot90_lane_f32(simde_float32x4_t r, simde_float32 { simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {1, 4, 3, 6}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values += b_.values * a_.values; @@ -240,31 +310,47 @@ SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_rot90_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = 
simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE > 128) + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_vdupq_n_f16(simde_float16x8_to_private(b).values[lane]); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {1, 8, 3, 10, 5, 12, 7, 14}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1]; - r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1]; - r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1]; - r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1]; - } + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1]; + r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1]; + r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1]; + 
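The RVV branches added throughout this file follow one recipe for the rot90 update r[2i] += -b[2i+1]*a[2i+1], r[2i+1] += b[2i]*a[2i+1]: duplicate the odd (imaginary) lanes of a with vrgather, build a (-b_imag, b_real) operand by gathering from a negated copy of b slid up next to the original, then accumulate with vfmacc. A minimal standalone sketch of that recipe for a single f32 complex pair follows; the helper name and the scalar load/store wrappers are illustrative only (not SIMDe API), and it assumes an RVV 1.0 intrinsics toolchain (e.g. clang -march=rv64gcv) with VLEN >= 128.

#include <riscv_vector.h>
#include <stdint.h>

/* r[0] += -a[1]*b[1]; r[1] += a[1]*b[0] -- the rot90 update for one complex
 * number, using the same vslideup/vrgather/vfneg/vfmacc sequence as the RVV
 * branches in this file. */
static inline void cmla_rot90_f32x2(float r[2], const float a[2], const float b[2]) {
  uint32_t idx1[2] = {1, 1};   /* select a[1] twice       */
  uint32_t idx2[2] = {1, 2};   /* select -b[1], then b[0] */
  vfloat32m1_t va = __riscv_vle32_v_f32m1(a, 2);
  vfloat32m1_t vb = __riscv_vle32_v_f32m1(b, 2);
  vfloat32m1_t vr = __riscv_vle32_v_f32m1(r, 2);
  /* [a0 a1 a0 a1] gathered with {1, 1} -> [a1 a1] */
  vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(
      __riscv_vslideup_vx_f32m1(va, va, 2, 4),
      __riscv_vle32_v_u32m1(idx1, 2), 2);
  /* [-b0 -b1 b0 b1] gathered with {1, 2} -> [-b1 b0] */
  vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(
      __riscv_vslideup_vx_f32m1(__riscv_vfneg_v_f32m1(vb, 2), vb, 2, 4),
      __riscv_vle32_v_u32m1(idx2, 2), 2);
  __riscv_vse32_v_f32m1(r, __riscv_vfmacc_vv_f32m1(vr, op1, op2, 2), 2);
}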
r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vcmlaq_rot90_laneq_f16 @@ -282,7 +368,17 @@ simde_float32x4_t simde_vcmlaq_rot90_laneq_f32(simde_float32x4_t r, simde_float3 { simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + #if defined(SIMDE_RISCV_V_NATIVE) + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {1, 4, 3, 6}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values += b_.values * a_.values; diff --git a/simde/arm/neon/combine.h b/simde/arm/neon/combine.h index 1a9218784..3db44edfa 100644 --- a/simde/arm/neon/combine.h +++ b/simde/arm/neon/combine.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_COMBINE_H) @@ -45,14 +46,16 @@ simde_vcombine_f16(simde_float16x4_t low, simde_float16x4_t high) { simde_float16x4_private low_ = simde_float16x4_to_private(low), high_ = simde_float16x4_to_private(high); - - size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < halfway ; i++) { - r_.values[i] = low_.values[i]; - r_.values[i + halfway] = high_.values[i]; - } - + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vslideup_vx_f16m1(low_.sv64, high_.sv64, 4, 8); + #else + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + #endif return simde_float16x8_from_private(r_); #endif } @@ -75,7 +78,9 @@ simde_vcombine_f32(simde_float32x2_t low, simde_float32x2_t high) { /* Note: __builtin_shufflevector can have a the output contain * twice the number of elements, __builtin_shuffle cannot. * Using SIMDE_SHUFFLE_VECTOR_ here would not work. 
*/ - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_f32m1(low_.sv64, high_.sv64, 2, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -105,7 +110,9 @@ simde_vcombine_f64(simde_float64x1_t low, simde_float64x1_t high) { low_ = simde_float64x1_to_private(low), high_ = simde_float64x1_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_f64m1(low_.sv64, high_.sv64, 1, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -135,7 +142,9 @@ simde_vcombine_s8(simde_int8x8_t low, simde_int8x8_t high) { low_ = simde_int8x8_to_private(low), high_ = simde_int8x8_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_i8m1(low_.sv64, high_.sv64, 8, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -165,7 +174,9 @@ simde_vcombine_s16(simde_int16x4_t low, simde_int16x4_t high) { low_ = simde_int16x4_to_private(low), high_ = simde_int16x4_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_i16m1(low_.sv64, high_.sv64, 4, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3, 4, 5, 6, 7); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -195,7 +206,9 @@ simde_vcombine_s32(simde_int32x2_t low, simde_int32x2_t high) { low_ = simde_int32x2_to_private(low), high_ = simde_int32x2_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_i32m1(low_.sv64, high_.sv64, 2, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -225,7 +238,9 @@ simde_vcombine_s64(simde_int64x1_t low, simde_int64x1_t high) { low_ = simde_int64x1_to_private(low), high_ = simde_int64x1_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_i64m1(low_.sv64, high_.sv64, 1, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -255,7 +270,9 @@ simde_vcombine_u8(simde_uint8x8_t low, simde_uint8x8_t high) { low_ = simde_uint8x8_to_private(low), high_ = 
simde_uint8x8_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_u8m1(low_.sv64, high_.sv64, 8, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -285,7 +302,9 @@ simde_vcombine_u16(simde_uint16x4_t low, simde_uint16x4_t high) { low_ = simde_uint16x4_to_private(low), high_ = simde_uint16x4_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_u16m1(low_.sv64, high_.sv64, 4, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3, 4, 5, 6, 7); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -315,7 +334,9 @@ simde_vcombine_u32(simde_uint32x2_t low, simde_uint32x2_t high) { low_ = simde_uint32x2_to_private(low), high_ = simde_uint32x2_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_u32m1(low_.sv64, high_.sv64, 2, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1, 2, 3); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; @@ -345,7 +366,9 @@ simde_vcombine_u64(simde_uint64x1_t low, simde_uint64x1_t high) { low_ = simde_uint64x1_to_private(low), high_ = simde_uint64x1_to_private(high); - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vslideup_vx_u64m1(low_.sv64, high_.sv64, 1, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) r_.values = __builtin_shufflevector(low_.values, high_.values, 0, 1); #else size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; diff --git a/simde/arm/neon/cvt.h b/simde/arm/neon/cvt.h index ab5122527..f60e5c092 100644 --- a/simde/arm/neon/cvt.h +++ b/simde/arm/neon/cvt.h @@ -44,7 +44,9 @@ simde_vcvt_f16_f32(simde_float32x4_t a) { simde_float32x4_private a_ = simde_float32x4_to_private(a); simde_float16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfncvt_f_f_w_f16m1(__riscv_vlmul_ext_v_f32m1_f32m2(a_.sv128), 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -70,7 +72,9 @@ simde_vcvt_f32_f16(simde_float16x4_t a) { simde_float16x4_private a_ = simde_float16x4_to_private(a); simde_float32x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vfwcvt_f_f_v_f32m2(a_.sv64, 4)); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -96,8 +100,10 @@ simde_vcvt_f32_f64(simde_float64x2_t a) { simde_float64x2_private a_ = simde_float64x2_to_private(a); 
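RVV's widening and narrowing float converts change LMUL by a factor of two, so the cvt.h hunks pair them with vlmul_ext/vlmul_trunc to keep the operands and result at m1. A minimal standalone sketch of both directions, assuming an RVV 1.0 toolchain with VLEN >= 128; the helper names are illustrative, not SIMDe API.

#include <riscv_vector.h>

/* f64 -> f32: extend the source group to m2 so the narrowing convert can
 * produce an m1 result (the shape of the vcvt_f32_f64 branch in this hunk). */
static inline void cvt_f64x2_to_f32x2(float dst[2], const double src[2]) {
  vfloat64m1_t v = __riscv_vle64_v_f64m1(src, 2);
  vfloat32m1_t n = __riscv_vfncvt_f_f_w_f32m1(__riscv_vlmul_ext_v_f64m1_f64m2(v), 2);
  __riscv_vse32_v_f32m1(dst, n, 2);
}

/* f32 -> f64: the widening convert yields an m2 group that is truncated back
 * to m1 before use. */
static inline void cvt_f32x2_to_f64x2(double dst[2], const float src[2]) {
  vfloat32m1_t v = __riscv_vle32_v_f32m1(src, 2);
  vfloat64m1_t w = __riscv_vlmul_trunc_v_f64m2_f64m1(__riscv_vfwcvt_f_f_v_f64m2(v, 2));
  __riscv_vse64_v_f64m1(dst, w, 2);
}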
simde_float32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfncvt_f_f_w_f32m1(__riscv_vlmul_ext_v_f64m1_f64m2(a_.sv128), 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -122,8 +128,10 @@ simde_vcvt_f64_f32(simde_float32x2_t a) { simde_float32x2_private a_ = simde_float32x2_to_private(a); simde_float64x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vlmul_trunc_v_f64m2_f64m1(__riscv_vfwcvt_f_f_v_f64m2(a_.sv64, 2)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -527,7 +535,17 @@ simde_vcvt_s32_f32(simde_float32x2_t a) { simde_float32x2_private a_ = simde_float32x2_to_private(a); simde_int32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv64 = __riscv_vfcvt_rtz_x_f_v_i32m1(a_.sv64, 2); + #else + r_.sv64 = __riscv_vmerge_vxm_i32m1( + __riscv_vfcvt_rtz_x_f_v_i32m1(a_.sv64, 2), + 0, + __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2), + 2); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -553,7 +571,9 @@ simde_vcvt_u16_f16(simde_float16x4_t a) { simde_float16x4_private a_ = simde_float16x4_to_private(a); simde_uint16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfcvt_rtz_xu_f_v_u16m1(a_.sv64, 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -579,7 +599,17 @@ simde_vcvt_u32_f32(simde_float32x2_t a) { simde_float32x2_private a_ = simde_float32x2_to_private(a); simde_uint32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv64 = __riscv_vfcvt_rtz_xu_f_v_u32m1(a_.sv64, 2); + #else + r_.sv64 = __riscv_vmerge_vxm_u32m1( + __riscv_vfcvt_rtz_xu_f_v_u32m1(a_.sv64, 2), + 0, + __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2), + 2); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -605,7 +635,17 @@ simde_vcvt_s64_f64(simde_float64x1_t a) { simde_float64x1_private a_ = simde_float64x1_to_private(a); simde_int64x1_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv64 = __riscv_vfcvt_rtz_x_f_v_i64m1(a_.sv64, 1); + #else + r_.sv64 = __riscv_vmerge_vxm_i64m1( + __riscv_vfcvt_rtz_x_f_v_i64m1(a_.sv64, 1), + 0, + __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1), + 1); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); 
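NEON's float-to-integer conversions return 0 for NaN inputs, while a bare RVV vfcvt follows the scalar FCVT rule and returns the saturated integer, so outside SIMDE_FAST_CONVERSION_RANGE these hunks classify the input with vfclass and vmerge the NaN lanes to zero. A minimal sketch of that guard, assuming an RVV 1.0 toolchain; the helper name is illustrative, not SIMDe API.

#include <riscv_vector.h>
#include <stdint.h>

/* Truncating f32 -> s32 with NaN lanes forced to zero: vfclass reports 0x200
 * (= 512) for a quiet NaN, and vmerge substitutes 0 wherever that class
 * matches -- the same guard used by the non-fast-range branches here. */
static inline void cvt_f32x2_to_s32x2(int32_t dst[2], const float src[2]) {
  vfloat32m1_t v = __riscv_vle32_v_f32m1(src, 2);
  vint32m1_t t = __riscv_vfcvt_rtz_x_f_v_i32m1(v, 2);  /* round toward zero */
  vbool32_t nan = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(v, 2), 512, 2);
  __riscv_vse32_v_i32m1(dst, __riscv_vmerge_vxm_i32m1(t, 0, nan, 2), 2);
}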
#else SIMDE_VECTORIZE @@ -631,7 +671,17 @@ simde_vcvt_u64_f64(simde_float64x1_t a) { simde_float64x1_private a_ = simde_float64x1_to_private(a); simde_uint64x1_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv64 = __riscv_vfcvt_rtz_xu_f_v_u64m1(a_.sv64, 1); + #else + r_.sv64 = __riscv_vmerge_vxm_u64m1( + __riscv_vfcvt_rtz_xu_f_v_u64m1(a_.sv64, 1), + 0, + __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1), + 1); + #endif + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); r_.values &= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (a_.values >= SIMDE_FLOAT64_C(0.0))); #else @@ -696,6 +746,16 @@ simde_vcvtq_s32_f32(simde_float32x4_t a) { #if !defined(SIMDE_FAST_NANS) r_.m128i = _mm_and_si128(r_.m128i, _mm_castps_si128(_mm_cmpord_ps(a_.m128, a_.m128))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv128 = __riscv_vfcvt_rtz_x_f_v_i32m1(a_.sv128, 4); + #else + r_.sv128 = __riscv_vmerge_vxm_i32m1( + __riscv_vfcvt_rtz_x_f_v_i32m1(a_.sv128, 4), + 0, + __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4), + 4); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) @@ -736,7 +796,9 @@ simde_vcvtq_u16_f16(simde_float16x8_t a) { simde_float16x8_private a_ = simde_float16x8_to_private(a); simde_uint16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfcvt_rtz_xu_f_v_u16m1(a_.sv128, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -793,6 +855,16 @@ simde_vcvtq_u32_f32(simde_float32x4_t a) { #if !defined(SIMDE_FAST_NANS) r_.m128i = _mm_and_si128(r_.m128i, _mm_castps_si128(_mm_cmpord_ps(a_.m128, a_.m128))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv128 = __riscv_vfcvt_rtz_xu_f_v_u32m1(a_.sv128, 4); + #else + r_.sv128 = __riscv_vmerge_vxm_u32m1( + __riscv_vfcvt_rtz_xu_f_v_u32m1(a_.sv128, 4), + 0, + __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4), + 4); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) @@ -873,6 +945,16 @@ simde_vcvtq_s64_f64(simde_float64x2_t a) { #if !defined(SIMDE_FAST_NANS) r_.m128i = _mm_and_si128(r_.m128i, _mm_castpd_si128(_mm_cmpord_pd(a_.m128d, a_.m128d))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv128 = __riscv_vfcvt_rtz_x_f_v_i64m1(a_.sv128, 2); + #else + r_.sv128 = __riscv_vmerge_vxm_i64m1( + __riscv_vfcvt_rtz_x_f_v_i64m1(a_.sv128, 2), + 0, + __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2), + 2); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) @@ -952,6 +1034,16 @@ 
simde_vcvtq_u64_f64(simde_float64x2_t a) { #if !defined(SIMDE_FAST_NANS) r_.m128i = _mm_and_si128(r_.m128i, _mm_castpd_si128(_mm_cmpord_pd(a_.m128d, a_.m128d))); #endif + #elif defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.sv128 = __riscv_vfcvt_rtz_xu_f_v_u64m1(a_.sv128, 2); + #else + r_.sv128 = __riscv_vmerge_vxm_u64m1( + __riscv_vfcvt_rtz_xu_f_v_u64m1(a_.sv128, 2), + 0, + __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2), + 2); + #endif #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); @@ -986,7 +1078,9 @@ simde_vcvt_f16_s16(simde_int16x4_t a) { simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_float16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfcvt_f_x_v_f16m1(a_.sv64, 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -1016,8 +1110,10 @@ simde_vcvt_f32_s32(simde_int32x2_t a) { simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_float32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfcvt_f_x_v_f32m1(a_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1042,14 +1138,18 @@ simde_vcvt_f16_u16(simde_uint16x4_t a) { simde_uint16x4_private a_ = simde_uint16x4_to_private(a); simde_float16x4_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - #if SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI - r_.values[i] = HEDLEY_STATIC_CAST(simde_float16_t, a_.values[i]); - #else - r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a_.values[i])); - #endif - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfcvt_f_xu_v_f16m1(a_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + r_.values[i] = HEDLEY_STATIC_CAST(simde_float16_t, a_.values[i]); + #else + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a_.values[i])); + #endif + } + #endif return simde_float16x4_from_private(r_); #endif @@ -1068,8 +1168,10 @@ simde_vcvt_f32_u32(simde_uint32x2_t a) { simde_uint32x2_private a_ = simde_uint32x2_to_private(a); simde_float32x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfcvt_f_xu_v_f32m1(a_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1094,8 +1196,10 @@ simde_vcvt_f64_s64(simde_int64x1_t a) { simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_float64x1_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfcvt_f_x_v_f64m1(a_.sv64, 
1); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1120,8 +1224,10 @@ simde_vcvt_f64_u64(simde_uint64x1_t a) { simde_uint64x1_private a_ = simde_uint64x1_to_private(a); simde_float64x1_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfcvt_f_xu_v_f64m1(a_.sv64, 1); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1146,7 +1252,9 @@ simde_vcvtq_f16_s16(simde_int16x8_t a) { simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_float16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfcvt_f_x_v_f16m1(a_.sv128, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -1176,8 +1284,10 @@ simde_vcvtq_f32_s32(simde_int32x4_t a) { simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_float32x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfcvt_f_x_v_f32m1(a_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1202,7 +1312,9 @@ simde_vcvtq_f16_u16(simde_uint16x8_t a) { simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_float16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfcvt_f_xu_v_f16m1(a_.sv128, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -1232,8 +1344,10 @@ simde_vcvtq_f32_u32(simde_uint32x4_t a) { simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_float32x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfcvt_f_xu_v_f32m1(a_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1262,8 +1376,10 @@ simde_vcvtq_f64_s64(simde_int64x2_t a) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) r_.m128d = _mm_cvtepi64_pd(a_.m128i); - #elif defined(SIMDE_CONVERT_VECTOR_) + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfcvt_f_x_v_f64m1(a_.sv128, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -1288,8 +1404,10 @@ simde_vcvtq_f64_u64(simde_uint64x2_t a) { simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_float64x2_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_RISCV_V_NATIVE) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfcvt_f_xu_v_f64m1(a_.sv128, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git 
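The dot.h and dot_lane.h changes that follow all reduce to the same three steps: a widening 8x8 -> 16-bit multiply, a widening convert of each 4-element group to 32 bits, and a vredsum into a zero-initialised accumulator element. A standalone sketch of one 4-element group, assuming an RVV 1.0 toolchain; dot4_s8 is an illustrative name, not SIMDe API.

#include <riscv_vector.h>
#include <stdint.h>

/* acc + (a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]) with int8 inputs and
 * an int32 accumulator -- one lane of vdot_s32, built from the same widening
 * multiply + reduction used in the hunks below. */
static inline int32_t dot4_s8(const int8_t a[4], const int8_t b[4], int32_t acc) {
  vint8m1_t va = __riscv_vle8_v_i8m1(a, 4);
  vint8m1_t vb = __riscv_vle8_v_i8m1(b, 4);
  vint16m2_t prod = __riscv_vwmul_vv_i16m2(va, vb, 4);            /* 8x8 -> 16 */
  vint32m1_t wide = __riscv_vwcvt_x_x_v_i32m1(
      __riscv_vlmul_trunc_v_i16m2_i16mf2(prod), 4);               /* 16 -> 32  */
  vint32m1_t sum = __riscv_vredsum_vs_i32m1_i32m1(
      wide, __riscv_vmv_v_x_i32m1(0, 4), 4);                      /* sum into element 0 */
  return acc + __riscv_vmv_x_s_i32m1_i32(sum);
}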
a/simde/arm/neon/dot.h b/simde/arm/neon/dot.h index a05d32d47..f195710be 100644 --- a/simde/arm/neon/dot.h +++ b/simde/arm/neon/dot.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_DOT_H) @@ -55,16 +56,32 @@ simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) { simde_int8x8_private a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - for (int i = 0 ; i < 2 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); - } - r_.values[i] = acc; - } - return simde_vadd_s32(r, simde_int32x2_from_private(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private r_tmp = simde_int32x2_to_private(r); + vint16m2_t vd_low = __riscv_vwmul_vv_i16m2 (a_.sv64, b_.sv64, 8); + vint16m2_t vd_high = __riscv_vslidedown_vx_i16m2(vd_low, 4, 8); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t vd_low_wide = __riscv_vwcvt_x_x_v_i32m1 (__riscv_vlmul_trunc_v_i16m2_i16mf2(vd_low), 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(vd_low_wide, vd, 4); + vint32m1_t vd_high_wide = __riscv_vwcvt_x_x_v_i32m1 (__riscv_vlmul_trunc_v_i16m2_i16mf2(vd_high), 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(vd_high_wide, vd, 4); + r_.sv64 = __riscv_vslideup_vx_i32m1( + __riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), + 1, 2); + return simde_int32x2_from_private(r_); + #else + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + #endif + return simde_vadd_s32(r, simde_int32x2_from_private(r_)); #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) @@ -85,15 +102,31 @@ simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - for (int i = 0 ; i < 2 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx]); - } - r_.values[i] = acc; - } + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private r_tmp = simde_uint32x2_to_private(r); + vuint16m2_t vd_low = __riscv_vwmulu_vv_u16m2 (a_.sv64, b_.sv64, 8); + vuint16m2_t vd_high = __riscv_vslidedown_vx_u16m2(vd_low, 4, 8); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t vd_low_wide = __riscv_vwcvtu_x_x_v_u32m1 (__riscv_vlmul_trunc_v_u16m2_u16mf2(vd_low), 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(vd_low_wide, vd, 4); + vuint32m1_t vd_high_wide = __riscv_vwcvtu_x_x_v_u32m1 (__riscv_vlmul_trunc_v_u16m2_u16mf2(vd_high), 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(vd_high_wide, vd, 4); + r_.sv64 = __riscv_vslideup_vx_u32m1( + __riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), + 1, 2); + return simde_uint32x2_from_private(r_); + #else + for (int i = 0 ; i < 2 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += 
HEDLEY_STATIC_CAST(uint32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + #endif return simde_vadd_u32(r, simde_uint32x2_from_private(r_)); #endif } @@ -116,15 +149,33 @@ simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { simde_int8x16_private a_ = simde_int8x16_to_private(a), b_ = simde_int8x16_to_private(b); - for (int i = 0 ; i < 4 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); - } - r_.values[i] = acc; - } + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_tmp = simde_int32x4_to_private(r); + vint16m2_t vd_low = __riscv_vwmul_vv_i16m2 (a_.sv128, b_.sv128, 16); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vlmul_trunc_v_i16m2_i16mf2( \ + vd_low), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vlmul_trunc_v_i16m2_i16mf2( \ + __riscv_vslidedown_vx_i16m2(vd_low, 4, 4)), 4), vd, 4); + vint32m1_t rst2 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vlmul_trunc_v_i16m2_i16mf2( \ + __riscv_vslidedown_vx_i16m2(vd_low, 8, 4)), 4), vd, 4); + vint32m1_t rst3 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vlmul_trunc_v_i16m2_i16mf2( \ + __riscv_vslidedown_vx_i16m2(vd_low, 12, 4)), 4), vd, 4); + vint32m1_t r0 = __riscv_vslideup_vx_i32m1(__riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), 1, 2); + vint32m1_t r1 = __riscv_vslideup_vx_i32m1(r0, __riscv_vadd_vx_i32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_i32m1(r1, __riscv_vadd_vx_i32m1(rst3, r_tmp.values[3], 2), 3, 4); + return simde_int32x4_from_private(r_); + #else + for (int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + #endif return simde_vaddq_s32(r, simde_int32x4_from_private(r_)); #endif } @@ -147,15 +198,33 @@ simde_vdotq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a), b_ = simde_uint8x16_to_private(b); - for (int i = 0 ; i < 4 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_tmp = simde_uint32x4_to_private(r); + vuint16m2_t vd_low = __riscv_vwmulu_vv_u16m2 (a_.sv128, b_.sv128, 16); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m2_u16mf2( \ + vd_low), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m2_u16mf2( \ + __riscv_vslidedown_vx_u16m2(vd_low, 4, 4)), 4), vd, 4); + vuint32m1_t rst2 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m2_u16mf2( \ + __riscv_vslidedown_vx_u16m2(vd_low, 8, 4)), 4), vd, 4); + vuint32m1_t rst3 = 
__riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vlmul_trunc_v_u16m2_u16mf2( \ + __riscv_vslidedown_vx_u16m2(vd_low, 12, 4)), 4), vd, 4); + vuint32m1_t r0 = __riscv_vslideup_vx_u32m1(__riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), 1, 2); + vuint32m1_t r1 = __riscv_vslideup_vx_u32m1(r0, __riscv_vadd_vx_u32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_u32m1(r1, __riscv_vadd_vx_u32m1(rst3, r_tmp.values[3], 2), 3, 4); + return simde_uint32x4_from_private(r_); + #else + for (int i = 0 ; i < 4 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx]); + } + r_.values[i] = acc; } - r_.values[i] = acc; - } + #endif return simde_vaddq_u32(r, simde_uint32x4_from_private(r_)); #endif } diff --git a/simde/arm/neon/dot_lane.h b/simde/arm/neon/dot_lane.h index a7d570b4a..71378e9ef 100644 --- a/simde/arm/neon/dot_lane.h +++ b/simde/arm/neon/dot_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_DOT_LANE_H) @@ -69,18 +70,31 @@ simde_vdot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b, const simde_int8x8_private a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - - for (int i = 0 ; i < 2 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private r_tmp = simde_int32x2_to_private(r); + vint8mf4_t vb_low = __riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(b_.sv64, lane*4, 4)); + vint16mf2_t vd_high = __riscv_vwmul_vv_i16mf2(__riscv_vlmul_trunc_v_i8m1_i8mf4 \ + (__riscv_vslidedown_vx_i8m1(a_.sv64, 4, 4)), vb_low, 4); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2( \ + __riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv64, 0, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 (vd_high, 4), vd, 4); + r_.sv64 = __riscv_vslideup_vx_i32m1( + __riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), + 1, 2); + #else + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_int32x2_from_private(r_); #endif @@ -120,18 +134,31 @@ simde_vdot_lane_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b, co simde_uint8x8_private a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - - for (int i = 0 ; i < 2 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, 
b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private r_tmp = simde_uint32x2_to_private(r); + vuint8mf4_t vb_low = __riscv_vlmul_trunc_v_u8m1_u8mf4 ( + __riscv_vslidedown_vx_u8m1(b_.sv64, lane*4, 4)); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv64, 0, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv64, 4, 4)), vb_low, 4), 4), vd, 4); + r_.sv64 = __riscv_vslideup_vx_u32m1( + __riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), + 1, 2); + #else + for (int i = 0 ; i < 2 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_uint32x2_from_private(r_); #endif @@ -169,18 +196,31 @@ simde_vdot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x16_t b, con simde_int32x2_private r_ = simde_int32x2_to_private(r); simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_int8x16_private b_ = simde_int8x16_to_private(b); - - for (int i = 0 ; i < 2 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private r_tmp = simde_int32x2_to_private(r); + vint8mf4_t vb_low = __riscv_vlmul_trunc_v_i8m1_i8mf4( + __riscv_vslidedown_vx_i8m1(b_.sv128, lane*4, 4)); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 (__riscv_vwmul_vv_i16mf2 ( \ + __riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv64, 0, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 (__riscv_vwmul_vv_i16mf2 ( \ + __riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv64, 4, 4)), vb_low, 4), 4), vd, 4); + r_.sv64 = __riscv_vslideup_vx_i32m1( + __riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), + 1, 2); + #else + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_int32x2_from_private(r_); #endif @@ -218,18 +258,31 @@ simde_vdot_laneq_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x16_t b, simde_uint32x2_private r_ = simde_uint32x2_to_private(r); simde_uint8x8_private a_ = simde_uint8x8_to_private(a); simde_uint8x16_private b_ = simde_uint8x16_to_private(b); - - for (int i = 0 ; i < 2 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for (int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc 
+= HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private r_tmp = simde_uint32x2_to_private(r); + vuint8mf4_t vb_low = __riscv_vlmul_trunc_v_u8m1_u8mf4 ( + __riscv_vslidedown_vx_u8m1(b_.sv128, lane*4, 4)); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4 (__riscv_vslidedown_vx_u8m1(a_.sv64, 0, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4 (__riscv_vslidedown_vx_u8m1(a_.sv64, 4, 4)), vb_low, 4), 4), vd, 4); + r_.sv64 = __riscv_vslideup_vx_u32m1( + __riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), + __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), + 1, 2); + #else + for (int i = 0 ; i < 2 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_uint32x2_from_private(r_); #endif return result; @@ -280,18 +333,34 @@ simde_vdotq_laneq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b simde_uint8x16_private a_ = simde_uint8x16_to_private(a), b_ = simde_uint8x16_to_private(b); - - for(int i = 0 ; i < 4 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for(int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_tmp = simde_uint32x4_to_private(r); + vuint8mf4_t vb_low = __riscv_vlmul_trunc_v_u8m1_u8mf4( + __riscv_vslidedown_vx_u8m1(b_.sv128, lane*4, 4)); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 0, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 4, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst2 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 8, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst3 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 12, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t r0 = __riscv_vslideup_vx_u32m1(__riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), 1, 2); + vuint32m1_t r1 = __riscv_vslideup_vx_u32m1(r0, __riscv_vadd_vx_u32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_u32m1(r1, __riscv_vadd_vx_u32m1(rst3, r_tmp.values[3], 2), 3, 4); + #else + for(int i = 0 ; i < 4 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += 
HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_uint32x4_from_private(r_); #endif return result; @@ -342,18 +411,38 @@ simde_vdotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b, c simde_int8x16_private a_ = simde_int8x16_to_private(a), b_ = simde_int8x16_to_private(b); - - for(int i = 0 ; i < 4 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for(int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_tmp = simde_int32x4_to_private(r); + vint8mf4_t vb_low = __riscv_vlmul_trunc_v_i8m1_i8mf4( + __riscv_vslidedown_vx_i8m1(b_.sv128, lane*4, 4)); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 \ + (__riscv_vwmul_vv_i16mf2 (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 0, 4)), \ + vb_low, 4), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 \ + (__riscv_vwmul_vv_i16mf2 (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 4, 4)), \ + vb_low, 4), 4), vd, 4); + vint32m1_t rst2 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 \ + (__riscv_vwmul_vv_i16mf2 (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 8, 4)), \ + vb_low, 4), 4), vd, 4); + vint32m1_t rst3 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1 \ + (__riscv_vwmul_vv_i16mf2 (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 12, 4)), \ + vb_low, 4), 4), vd, 4); + vint32m1_t r0 = __riscv_vslideup_vx_i32m1(__riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), 1, 2); + vint32m1_t r1 = __riscv_vslideup_vx_i32m1(r0, __riscv_vadd_vx_i32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_i32m1(r1, __riscv_vadd_vx_i32m1(rst3, r_tmp.values[3], 2), 3, 4); + #else + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_int32x4_from_private(r_); #endif return result; @@ -403,18 +492,33 @@ simde_vdotq_lane_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x8_t b, simde_uint32x4_private r_ = simde_uint32x4_to_private(r); simde_uint8x16_private a_ = simde_uint8x16_to_private(a); simde_uint8x8_private b_ = simde_uint8x8_to_private(b); - - for(int i = 0 ; i < 4 ; i++) { - uint32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for(int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_tmp = simde_uint32x4_to_private(r); + vuint8mf4_t vb_low = __riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(b_.sv64, lane*4, 4)); + vuint32m1_t vd = __riscv_vmv_v_x_u32m1(0, 4); + vuint32m1_t rst0 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + 
(__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 0, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst1 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 4, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst2 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 8, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t rst3 = __riscv_vredsum_vs_u32m1_u32m1(__riscv_vwcvtu_x_x_v_u32m1(__riscv_vwmulu_vv_u16mf2 \ + (__riscv_vlmul_trunc_v_u8m1_u8mf4(__riscv_vslidedown_vx_u8m1(a_.sv128, 12, 4)), vb_low, 4), 4), vd, 4); + vuint32m1_t r0 = __riscv_vslideup_vx_u32m1(__riscv_vadd_vx_u32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_u32m1(rst1, r_tmp.values[1], 2), 1, 2); + vuint32m1_t r1 = __riscv_vslideup_vx_u32m1(r0, __riscv_vadd_vx_u32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_u32m1(r1, __riscv_vadd_vx_u32m1(rst3, r_tmp.values[3], 2), 3, 4); + #else + for(int i = 0 ; i < 4 ; i++) { + uint32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(uint32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_uint32x4_from_private(r_); #endif return result; @@ -464,18 +568,34 @@ simde_vdotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x8_t b, con simde_int32x4_private r_ = simde_int32x4_to_private(r); simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_int8x8_private b_ = simde_int8x8_to_private(b); - - for(int i = 0 ; i < 4 ; i++) { - int32_t acc = 0; - SIMDE_VECTORIZE_REDUCTION(+:acc) - for(int j = 0 ; j < 4 ; j++) { - const int idx_b = j + (lane << 2); - const int idx_a = j + (i << 2); - acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_tmp = simde_int32x4_to_private(r); + vint8mf4_t vb_low = __riscv_vlmul_trunc_v_i8m1_i8mf4( + __riscv_vslidedown_vx_i8m1(b_.sv64, lane*4, 4)); + vint32m1_t vd = __riscv_vmv_v_x_i32m1(0, 4); + vint32m1_t rst0 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2 \ + (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 0, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst1 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2 \ + (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 4, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst2 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2 \ + (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 8, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t rst3 = __riscv_vredsum_vs_i32m1_i32m1(__riscv_vwcvt_x_x_v_i32m1(__riscv_vwmul_vv_i16mf2 \ + (__riscv_vlmul_trunc_v_i8m1_i8mf4(__riscv_vslidedown_vx_i8m1(a_.sv128, 12, 4)), vb_low, 4), 4), vd, 4); + vint32m1_t r0 = __riscv_vslideup_vx_i32m1(__riscv_vadd_vx_i32m1(rst0, r_tmp.values[0], 2), __riscv_vadd_vx_i32m1(rst1, r_tmp.values[1], 2), 1, 2); + vint32m1_t r1 = __riscv_vslideup_vx_i32m1(r0, __riscv_vadd_vx_i32m1(rst2, r_tmp.values[2], 2), 2, 3); + r_.sv128 = __riscv_vslideup_vx_i32m1(r1, __riscv_vadd_vx_i32m1(rst3, r_tmp.values[3], 2), 3, 4); + #else + for(int i = 0 
; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; } - r_.values[i] += acc; - } - + #endif result = simde_int32x4_from_private(r_); #endif return result; diff --git a/simde/arm/neon/dup_n.h b/simde/arm/neon/dup_n.h index 365293edf..61b06a3dc 100644 --- a/simde/arm/neon/dup_n.h +++ b/simde/arm/neon/dup_n.h @@ -24,6 +24,7 @@ * 2020 Sean Maher (Copyright owned by Google, LLC) * 2020 Evan Nemerson * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_DUP_N_H) @@ -42,12 +43,14 @@ simde_vdup_n_f16(simde_float16_t value) { return vdup_n_f16(value); #else simde_float16x4_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfmv_v_f_f16m1 (value, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_float16x4_from_private(r_); #endif } @@ -66,12 +69,14 @@ simde_vdup_n_f32(float value) { return vdup_n_f32(value); #else simde_float32x2_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmv_v_f_f32m1(value, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_float32x2_from_private(r_); #endif } @@ -90,12 +95,14 @@ simde_vdup_n_f64(double value) { return vdup_n_f64(value); #else simde_float64x1_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmv_v_f_f64m1(value, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_float64x1_from_private(r_); #endif } @@ -117,11 +124,13 @@ simde_vdup_n_s8(int8_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi8(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_i8m1(value, 8); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } #endif return simde_int8x8_from_private(r_); @@ -145,6 +154,8 @@ simde_vdup_n_s16(int16_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi16(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_i16m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -173,6 +184,8 @@ simde_vdup_n_s32(int32_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi32(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_i32m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -199,11 +212,14 @@ simde_vdup_n_s64(int64_t value) { #else 
simde_int64x1_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_i64m1(value, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_int64x1_from_private(r_); #endif } @@ -225,6 +241,8 @@ simde_vdup_n_u8(uint8_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi8(HEDLEY_STATIC_CAST(int8_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_u8m1(value, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -253,6 +271,8 @@ simde_vdup_n_u16(uint16_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi16(HEDLEY_STATIC_CAST(int16_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_u16m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -281,6 +301,8 @@ simde_vdup_n_u32(uint32_t value) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_set1_pi32(HEDLEY_STATIC_CAST(int32_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_u32m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -306,12 +328,14 @@ simde_vdup_n_u64(uint64_t value) { return vdup_n_u64(value); #else simde_uint64x1_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmv_v_x_u64m1(value, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_uint64x1_from_private(r_); #endif } @@ -330,12 +354,14 @@ simde_vdupq_n_f16(simde_float16_t value) { return vdupq_n_f16(value); #else simde_float16x8_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = value; - } - + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfmv_v_f_f16m1(value, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + #endif return simde_float16x8_from_private(r_); #endif } @@ -362,6 +388,8 @@ simde_vdupq_n_f32(float value) { r_.m128 = _mm_set1_ps(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmv_v_f_f32m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -395,6 +423,8 @@ simde_vdupq_n_f64(double value) { r_.m128d = _mm_set1_pd(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmv_v_f_f64m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -427,6 +457,8 @@ simde_vdupq_n_s8(int8_t value) { r_.m128i = _mm_set1_epi8(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_i8m1(value, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -459,6 +491,8 @@ simde_vdupq_n_s16(int16_t 
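The vdup_n/vdupq_n hunks above all reduce to a single register splat: vmv.v.x for the integer types and vfmv.v.f for the floating-point types, with the element count fixed by the NEON vector width. A minimal sketch of that mapping, assuming <riscv_vector.h>, VLEN >= 128, and illustrative helper names:

#include <riscv_vector.h>
#include <stdint.h>

/* vdup_n_s16-style splat: broadcast one int16 across the 4 lanes of a
 * 64-bit NEON vector (vl = 4 on an LMUL=1 register). */
static void dup_n_s16(int16_t value, int16_t out[4]) {
  __riscv_vse16_v_i16m1(out, __riscv_vmv_v_x_i16m1(value, 4), 4);
}

/* Floating-point splats use vfmv.v.f instead, as in vdup_n_f32 above. */
static void dup_n_f32(float value, float out[2]) {
  __riscv_vse32_v_f32m1(out, __riscv_vfmv_v_f_f32m1(value, 2), 2);
}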
value) { r_.m128i = _mm_set1_epi16(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_i16m1(value, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -491,6 +525,8 @@ simde_vdupq_n_s32(int32_t value) { r_.m128i = _mm_set1_epi32(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_i32m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -523,6 +559,8 @@ simde_vdupq_n_s64(int64_t value) { r_.m128i = _mm_set1_epi64x(value); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_splat(value); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_i64m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -555,6 +593,8 @@ simde_vdupq_n_u8(uint8_t value) { r_.m128i = _mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value)); #elif defined (SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_splat(HEDLEY_STATIC_CAST(int8_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_u8m1(value, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -587,6 +627,8 @@ simde_vdupq_n_u16(uint16_t value) { r_.m128i = _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value)); #elif defined (SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_splat(HEDLEY_STATIC_CAST(int16_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_u16m1(value, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -619,6 +661,8 @@ simde_vdupq_n_u32(uint32_t value) { r_.m128i = _mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value)); #elif defined (SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_splat(HEDLEY_STATIC_CAST(int32_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_u32m1(value, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -651,6 +695,8 @@ simde_vdupq_n_u64(uint64_t value) { r_.m128i = _mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value)); #elif defined (SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_splat(HEDLEY_STATIC_CAST(int64_t, value)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmv_v_x_u64m1(value, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/simde/arm/neon/eor.h b/simde/arm/neon/eor.h index 951476025..9bb53b479 100644 --- a/simde/arm/neon/eor.h +++ b/simde/arm/neon/eor.h @@ -24,6 +24,8 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su (Copyright owned by NTHU pllab) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_EOR_H) @@ -48,6 +50,8 @@ simde_veor_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_i8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -78,6 +82,8 @@ simde_veor_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = 
__riscv_vxor_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -108,6 +114,8 @@ simde_veor_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_i32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -138,6 +146,8 @@ simde_veor_s64(simde_int64x1_t a, simde_int64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_i64m1(a_.sv64, b_.sv64, 1); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -168,6 +178,8 @@ simde_veor_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_u8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -198,6 +210,8 @@ simde_veor_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_u16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -228,6 +242,8 @@ simde_veor_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_u32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -258,6 +274,8 @@ simde_veor_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_xor_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vxor_vv_u64m1(a_.sv64, b_.sv64, 1); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -292,6 +310,8 @@ simde_veorq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -326,6 +346,8 @@ simde_veorq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -360,6 +382,8 @@ simde_veorq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -394,6 +418,8 @@ simde_veorq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i64m1(a_.sv128, b_.sv128, 2); #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -428,6 +454,8 @@ simde_veorq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -462,6 +490,8 @@ simde_veorq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -496,6 +526,8 @@ simde_veorq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -530,6 +562,8 @@ simde_veorq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { r_.m128i = _mm_xor_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_xor(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values ^ b_.values; #else @@ -560,10 +594,16 @@ simde_veor3q_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { b_ = simde_int8x16_to_private(b), c_ = simde_int8x16_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i8m1(__riscv_vxor_vv_i8m1(a_.sv128, b_.sv128, 16), c_.sv128, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif return simde_int8x16_from_private(r_); #endif @@ -585,10 +625,16 @@ simde_veor3q_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { b_ = simde_int16x8_to_private(b), c_ = simde_int16x8_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i16m1(__riscv_vxor_vv_i16m1(a_.sv128, b_.sv128, 8), c_.sv128, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif return simde_int16x8_from_private(r_); #endif @@ -610,10 +656,16 @@ simde_veor3q_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { b_ = simde_int32x4_to_private(b), c_ = simde_int32x4_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = 
__riscv_vxor_vv_i32m1(__riscv_vxor_vv_i32m1(a_.sv128, b_.sv128, 4), c_.sv128, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif return simde_int32x4_from_private(r_); #endif @@ -635,10 +687,16 @@ simde_veor3q_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) { b_ = simde_int64x2_to_private(b), c_ = simde_int64x2_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_i64m1(__riscv_vxor_vv_i64m1(a_.sv128, b_.sv128, 2), c_.sv128, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif return simde_int64x2_from_private(r_); #endif @@ -660,10 +718,16 @@ simde_veor3q_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { b_ = simde_uint8x16_to_private(b), c_ = simde_uint8x16_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u8m1(__riscv_vxor_vv_u8m1(a_.sv128, b_.sv128, 16), c_.sv128, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif return simde_uint8x16_from_private(r_); #endif @@ -685,10 +749,16 @@ simde_veor3q_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { b_ = simde_uint16x8_to_private(b), c_ = simde_uint16x8_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u16m1(__riscv_vxor_vv_u16m1(a_.sv128, b_.sv128, 8), c_.sv128, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif return simde_uint16x8_from_private(r_); #endif @@ -710,10 +780,16 @@ simde_veor3q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { b_ = simde_uint32x4_to_private(b), c_ = simde_uint32x4_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u32m1(__riscv_vxor_vv_u32m1(a_.sv128, b_.sv128, 4), c_.sv128, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif return simde_uint32x4_from_private(r_); #endif @@ -735,10 +811,16 @@ 
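Both the two-operand veor* forms and the three-operand veor3q* forms above map onto vxor.vv; the three-way case simply chains two XORs. A small standalone sketch, assuming <riscv_vector.h>, VLEN >= 128, and an illustrative helper name:

#include <riscv_vector.h>
#include <stdint.h>

/* veor3q_u8-style three-way XOR of 16 bytes; a plain veorq_* is just the
 * inner __riscv_vxor_vv_u8m1 call. */
static void eor3q_u8(const uint8_t a[16], const uint8_t b[16],
                     const uint8_t c[16], uint8_t out[16]) {
  vuint8m1_t va = __riscv_vle8_v_u8m1(a, 16);
  vuint8m1_t vb = __riscv_vle8_v_u8m1(b, 16);
  vuint8m1_t vc = __riscv_vle8_v_u8m1(c, 16);
  __riscv_vse8_v_u8m1(out,
      __riscv_vxor_vv_u8m1(__riscv_vxor_vv_u8m1(va, vb, 16), vc, 16), 16);
}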
simde_veor3q_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { b_ = simde_uint64x2_to_private(b), c_ = simde_uint64x2_to_private(c); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vxor_vv_u64m1(__riscv_vxor_vv_u64m1(a_.sv128, b_.sv128, 2), c_.sv128, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values ^ b_.values ^ c_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + #endif return simde_uint64x2_from_private(r_); #endif diff --git a/simde/arm/neon/ext.h b/simde/arm/neon/ext.h index 45c5aa0f0..4c874d92d 100644 --- a/simde/arm/neon/ext.h +++ b/simde/arm/neon/ext.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_EXT_H) @@ -47,11 +48,16 @@ simde_vext_f16(simde_float16x4_t a, simde_float16x4_t b, const int n) a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_.sv64 = __riscv_vslidedown_vx_f16m1(a_.sv64, n, 4); + r_.sv64 = __riscv_vslideup_vx_f16m1(a_.sv64, b_.sv64, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + #endif return simde_float16x4_from_private(r_); #endif } @@ -73,11 +79,16 @@ simde_vext_f32(simde_float32x2_t a, simde_float32x2_t b, const int n) a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_f32m1(a_.sv64, n, 2); + r_.sv64 = __riscv_vslideup_vx_f32m1(a_.sv64, b_.sv64, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_float32x2_from_private(r_); #endif } @@ -108,11 +119,16 @@ simde_vext_f64(simde_float64x1_t a, simde_float64x1_t b, const int n) a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 0]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_f64m1(a_.sv64, n, 1); + r_.sv64 = __riscv_vslideup_vx_f64m1(a_.sv64, b_.sv64, 1-n, 1); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; + } + #endif return simde_float64x1_from_private(r_); #endif } @@ -144,11 +160,16 @@ simde_vext_s8(simde_int8x8_t a, simde_int8x8_t b, const int n) a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_i8m1(a_.sv64, n, 8); + r_.sv64 = __riscv_vslideup_vx_i8m1(a_.sv64, b_.sv64, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + #endif return simde_int8x8_from_private(r_); #endif } @@ -183,11 +204,16 @@ simde_vext_s16(simde_int16x4_t a, simde_int16x4_t b, const int n) a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_i16m1(a_.sv64, n, 4); + r_.sv64 = __riscv_vslideup_vx_i16m1(a_.sv64, b_.sv64, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + #endif return simde_int16x4_from_private(r_); #endif } @@ -220,11 +246,16 @@ simde_vext_s32(simde_int32x2_t a, simde_int32x2_t b, const int n) a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_i32m1(a_.sv64, n, 2); + r_.sv64 = __riscv_vslideup_vx_i32m1(a_.sv64, b_.sv64, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 1]; + } + #endif return simde_int32x2_from_private(r_); #endif } @@ -255,11 +286,16 @@ simde_vext_s64(simde_int64x1_t a, simde_int64x1_t b, const int n) a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_i64m1(a_.sv64, n, 1); + r_.sv64 = __riscv_vslideup_vx_i64m1(a_.sv64, b_.sv64, 1-n, 1); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; + } + #endif return simde_int64x1_from_private(r_); #endif } @@ -291,11 +327,16 @@ simde_vext_u8(simde_uint8x8_t a, simde_uint8x8_t b, const int n) a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_u8m1(a_.sv64, n, 8); + r_.sv64 = __riscv_vslideup_vx_u8m1(a_.sv64, b_.sv64, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + #endif return simde_uint8x8_from_private(r_); #endif } @@ -330,11 +371,16 @@ simde_vext_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int n) a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_u16m1(a_.sv64, n, 4); + r_.sv64 = __riscv_vslideup_vx_u16m1(a_.sv64, b_.sv64, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + #endif return simde_uint16x4_from_private(r_); #endif } @@ -367,11 +413,16 @@ simde_vext_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int n) a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_u32m1(a_.sv64, n, 2); + r_.sv64 = __riscv_vslideup_vx_u32m1(a_.sv64, b_.sv64, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_uint32x2_from_private(r_); #endif } @@ -402,11 +453,16 @@ simde_vext_u64(simde_uint64x1_t a, simde_uint64x1_t b, const int n) a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv64 = __riscv_vslidedown_vx_u64m1(a_.sv64, n, 1); + r_.sv64 = __riscv_vslideup_vx_u64m1(a_.sv64, b_.sv64, 1-n, 1); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; + } + #endif return simde_uint64x1_from_private(r_); #endif } @@ -438,11 +494,16 @@ simde_vextq_f16(simde_float16x8_t a, simde_float16x8_t b, const int n) a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_.sv128 = __riscv_vslidedown_vx_f16m1(a_.sv128, n, 8); + r_.sv128 = __riscv_vslideup_vx_f16m1(a_.sv128, b_.sv128, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + #endif return simde_float16x8_from_private(r_); #endif } @@ -464,11 +525,16 @@ simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n) a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_f32m1(a_.sv128, n, 4); + r_.sv128 = __riscv_vslideup_vx_f32m1(a_.sv128, b_.sv128, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 3]; + } + #endif return simde_float32x4_from_private(r_); #endif } @@ -509,11 +575,16 @@ simde_vextq_f64(simde_float64x2_t a, simde_float64x2_t b, const int n) a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_f64m1(a_.sv128, n, 2); + r_.sv128 = __riscv_vslideup_vx_f64m1(a_.sv128, b_.sv128, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_float64x2_from_private(r_); #endif } @@ -552,11 +623,16 @@ simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n) a_ = simde_int8x16_to_private(a), b_ = simde_int8x16_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_i8m1(a_.sv128, n, 16); + r_.sv128 = __riscv_vslideup_vx_i8m1(a_.sv128, b_.sv128, 16-n, 16); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; + } + #endif return simde_int8x16_from_private(r_); #endif } @@ -609,11 +685,16 @@ simde_vextq_s16(simde_int16x8_t a, simde_int16x8_t b, const int n) a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_i16m1(a_.sv128, n, 8); + r_.sv128 = __riscv_vslideup_vx_i16m1(a_.sv128, b_.sv128, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + #endif return simde_int16x8_from_private(r_); #endif } @@ -658,11 +739,16 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n) a_ = simde_int32x4_to_private(a), b_ = simde_int32x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_i32m1(a_.sv128, n, 4); + r_.sv128 = __riscv_vslideup_vx_i32m1(a_.sv128, b_.sv128, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + #endif return simde_int32x4_from_private(r_); #endif } @@ -703,11 +789,16 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) a_ = simde_int64x2_to_private(a), b_ = simde_int64x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_i64m1(a_.sv128, n, 2); + r_.sv128 = __riscv_vslideup_vx_i64m1(a_.sv128, b_.sv128, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_int64x2_from_private(r_); #endif } @@ -746,11 +837,16 @@ simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n) a_ = simde_uint8x16_to_private(a), b_ = simde_uint8x16_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_u8m1(a_.sv128, n, 16); + r_.sv128 = __riscv_vslideup_vx_u8m1(a_.sv128, b_.sv128, 16-n, 16); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; + } + #endif return simde_uint8x16_from_private(r_); #endif } @@ -789,11 +885,16 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n) a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_u16m1(a_.sv128, n, 8); + r_.sv128 = __riscv_vslideup_vx_u16m1(a_.sv128, b_.sv128, 8-n, 8); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
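Every vext/vextq variant above uses the same two-step idiom: vslidedown.vx moves element n of a down to element 0, and vslideup.vx with offset (vl - n) then fills the upper lanes from b while leaving the lower (vl - n) lanes of its destination operand untouched, which is exactly NEON's EXT behaviour. A standalone sketch of the byte case, assuming <riscv_vector.h>, VLEN >= 128, n in [0, 15], and an illustrative helper name:

#include <riscv_vector.h>
#include <stdint.h>

/* vextq_u8-style extract: the upper (16 - n) bytes of `a` followed by the
 * first n bytes of `b`. */
static void extq_u8(const uint8_t a[16], const uint8_t b[16],
                    size_t n, uint8_t out[16]) {
  vuint8m1_t va = __riscv_vle8_v_u8m1(a, 16);
  vuint8m1_t vb = __riscv_vle8_v_u8m1(b, 16);
  /* After the slidedown, element i holds a[i + n]. */
  va = __riscv_vslidedown_vx_u8m1(va, n, 16);
  /* slideup keeps the first (16 - n) elements of va and writes b[0..n-1]
   * into the remaining lanes. */
  vuint8m1_t r = __riscv_vslideup_vx_u8m1(va, vb, 16 - n, 16);
  __riscv_vse8_v_u8m1(out, r, 16);
}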
a_.values[src] : b_.values[src & 7]; + } + #endif return simde_uint16x8_from_private(r_); #endif } @@ -837,11 +938,16 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n) a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_u32m1(a_.sv128, n, 4); + r_.sv128 = __riscv_vslideup_vx_u32m1(a_.sv128, b_.sv128, 4-n, 4); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + #endif return simde_uint32x4_from_private(r_); #endif } @@ -874,11 +980,16 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b), r_ = a_; - const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_.sv128 = __riscv_vslidedown_vx_u64m1(a_.sv128, n, 2); + r_.sv128 = __riscv_vslideup_vx_u64m1(a_.sv128, b_.sv128, 2-n, 2); + #else + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1]; + } + #endif return simde_uint64x2_from_private(r_); #endif } diff --git a/simde/arm/neon/maxnmv.h b/simde/arm/neon/maxnmv.h index 7f00628e1..eba518874 100644 --- a/simde/arm/neon/maxnmv.h +++ b/simde/arm/neon/maxnmv.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MAXNMV_H) @@ -45,10 +46,15 @@ simde_vmaxnmv_f32(simde_float32x2_t a) { simde_float32x2_private a_ = simde_float32x2_to_private(a); r = -SIMDE_MATH_INFINITYF; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r = a_.values[i] > r ? a_.values[i] : r; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmax_vs_f32m1_f32m1(a_.sv64, \ + __riscv_vfmv_v_f_f32m1(r, 2), 2)); + #else + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? a_.values[i] : r; + } + #endif #endif return r; @@ -69,10 +75,18 @@ simde_vmaxnmvq_f32(simde_float32x4_t a) { simde_float32x4_private a_ = simde_float32x4_to_private(a); r = -SIMDE_MATH_INFINITYF; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r = a_.values[i] > r ? 
a_.values[i] : r; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmax_vs_f32m1_f32m1(a_.sv128, \ + __riscv_vfmv_v_f_f32m1(r, 4), 4)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && HEDLEY_HAS_BUILTIN(__builtin_reduce_max) + simde_float32_t rst = __builtin_reduce_max(a_.values); + r = (rst > r) ? rst : r; + #else + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? a_.values[i] : r; + } + #endif #endif return r; @@ -93,10 +107,15 @@ simde_vmaxnmvq_f64(simde_float64x2_t a) { simde_float64x2_private a_ = simde_float64x2_to_private(a); r = -SIMDE_MATH_INFINITY; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - r = a_.values[i] > r ? a_.values[i] : r; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmax_vs_f64m1_f64m1(a_.sv128, \ + __riscv_vfmv_v_f_f64m1(r, 2), 2)); + #else + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? a_.values[i] : r; + } + #endif #endif return r; @@ -112,23 +131,28 @@ simde_vmaxnmv_f16(simde_float16x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vmaxnmv_f16(a); #else - simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF); simde_float16x4_private a_ = simde_float16x4_to_private(a); - #if defined(SIMDE_FAST_NANS) - SIMDE_VECTORIZE_REDUCTION(max:r_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + return __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredmax_vs_f16m1_f16m1(a_.sv64, \ + __riscv_vfmv_v_f_f16m1(SIMDE_NINFINITYHF, 4), 4)); #else - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF); #if defined(SIMDE_FAST_NANS) - r_ = tmp_a > r_ ? tmp_a : r_; + SIMDE_VECTORIZE_REDUCTION(max:r_) #else - r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + SIMDE_VECTORIZE #endif - } - return simde_float16_from_float32(r_); + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a > r_ ? tmp_a : r_; + #else + r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -142,23 +166,28 @@ simde_vmaxnmvq_f16(simde_float16x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vmaxnmvq_f16(a); #else - simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF); simde_float16x8_private a_ = simde_float16x8_to_private(a); - #if defined(SIMDE_FAST_NANS) - SIMDE_VECTORIZE_REDUCTION(max:r_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + return __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredmax_vs_f16m1_f16m1(a_.sv128, \ + __riscv_vfmv_v_f_f16m1(SIMDE_NINFINITYHF, 8), 8)); #else - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF); #if defined(SIMDE_FAST_NANS) - r_ = tmp_a > r_ ? 
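The across-vector maxima above become a single vfredmax.vs reduction seeded with negative infinity, with vfmv.f.s reading the scalar result back out of element 0; the vminnmv* counterparts later in the patch mirror this with vfredmin.vs and a positive-infinity seed, and the half-precision variants are additionally gated on SIMDE_ARCH_RISCV_ZVFH. A minimal sketch of the float32 case, assuming <riscv_vector.h> and an illustrative helper name:

#include <riscv_vector.h>
#include <math.h>

/* vmaxnmvq_f32-style horizontal max of 4 floats: reduce with a -inf seed,
 * then read the scalar result from element 0. */
static float maxnmvq_f32(const float a[4]) {
  vfloat32m1_t va   = __riscv_vle32_v_f32m1(a, 4);
  vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(-INFINITY, 4);
  return __riscv_vfmv_f_s_f32m1_f32(
      __riscv_vfredmax_vs_f32m1_f32m1(va, seed, 4));
}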
tmp_a : r_; + SIMDE_VECTORIZE_REDUCTION(max:r_) #else - r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + SIMDE_VECTORIZE #endif - } - return simde_float16_from_float32(r_); + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a > r_ ? tmp_a : r_; + #else + r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/minnmv.h b/simde/arm/neon/minnmv.h index 11e1b3438..ecd0cab1a 100644 --- a/simde/arm/neon/minnmv.h +++ b/simde/arm/neon/minnmv.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MINNMV_H) @@ -40,23 +41,29 @@ simde_vminnmv_f16(simde_float16x4_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vminnmv_f16(a); #else - simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF); simde_float16x4_private a_ = simde_float16x4_to_private(a); - - #if defined(SIMDE_FAST_NANS) - SIMDE_VECTORIZE_REDUCTION(min:r_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + return __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredmin_vs_f16m1_f16m1(a_.sv64, \ + __riscv_vfmv_v_f_f16m1(SIMDE_INFINITYHF, 4), 4)); #else - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF); + a_ = simde_float16x4_to_private(a); + #if defined(SIMDE_FAST_NANS) - r_ = tmp_a < r_ ? tmp_a : r_; + SIMDE_VECTORIZE_REDUCTION(min:r_) #else - r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + SIMDE_VECTORIZE #endif - } - return simde_float16_from_float32(r_); + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a < r_ ? tmp_a : r_; + #else + r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -75,18 +82,23 @@ simde_vminnmv_f32(simde_float32x2_t a) { simde_float32x2_private a_ = simde_float32x2_to_private(a); r = SIMDE_MATH_INFINITYF; - #if defined(SIMDE_FAST_NANS) - SIMDE_VECTORIZE_REDUCTION(min:r) + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmin_vs_f32m1_f32m1(a_.sv64, \ + __riscv_vfmv_v_f_f32m1(r, 2), 2)); #else - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { #if defined(SIMDE_FAST_NANS) - r = a_.values[i] < r ? a_.values[i] : r; + SIMDE_VECTORIZE_REDUCTION(min:r) #else - r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + SIMDE_VECTORIZE #endif - } + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? 
r : a_.values[i])); + #endif + } + #endif #endif return r; @@ -102,23 +114,28 @@ simde_vminnmvq_f16(simde_float16x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vminnmvq_f16(a); #else - simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF); simde_float16x8_private a_ = simde_float16x8_to_private(a); - #if defined(SIMDE_FAST_NANS) - SIMDE_VECTORIZE_REDUCTION(min:r_) + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + return __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredmin_vs_f16m1_f16m1(a_.sv128, \ + __riscv_vfmv_v_f_f16m1(SIMDE_INFINITYHF, 8), 8)); #else - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { - simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF); #if defined(SIMDE_FAST_NANS) - r_ = tmp_a < r_ ? tmp_a : r_; + SIMDE_VECTORIZE_REDUCTION(min:r_) #else - r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + SIMDE_VECTORIZE #endif - } - return simde_float16_from_float32(r_); + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a < r_ ? tmp_a : r_; + #else + r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -137,18 +154,23 @@ simde_vminnmvq_f32(simde_float32x4_t a) { simde_float32x4_private a_ = simde_float32x4_to_private(a); r = SIMDE_MATH_INFINITYF; - #if defined(SIMDE_FAST_NANS) - SIMDE_VECTORIZE_REDUCTION(min:r) + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmin_vs_f32m1_f32m1(a_.sv128, \ + __riscv_vfmv_v_f_f32m1(r, 4), 4)); #else - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { #if defined(SIMDE_FAST_NANS) - r = a_.values[i] < r ? a_.values[i] : r; + SIMDE_VECTORIZE_REDUCTION(min:r) #else - r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + SIMDE_VECTORIZE #endif - } + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + #endif + } + #endif #endif return r; @@ -169,18 +191,23 @@ simde_vminnmvq_f64(simde_float64x2_t a) { simde_float64x2_private a_ = simde_float64x2_to_private(a); r = SIMDE_MATH_INFINITY; - #if defined(SIMDE_FAST_NANS) - SIMDE_VECTORIZE_REDUCTION(min:r) + #if defined(SIMDE_RISCV_V_NATIVE) + r = __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmin_vs_f64m1_f64m1(a_.sv128, \ + __riscv_vfmv_v_f_f64m1(r, 2), 2)); #else - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { #if defined(SIMDE_FAST_NANS) - r = a_.values[i] < r ? a_.values[i] : r; + SIMDE_VECTORIZE_REDUCTION(min:r) #else - r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + SIMDE_VECTORIZE #endif - } + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? 
a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + #endif + } + #endif #endif return r; diff --git a/simde/arm/neon/movl.h b/simde/arm/neon/movl.h index 853e3249e..91b2db9b9 100644 --- a/simde/arm/neon/movl.h +++ b/simde/arm/neon/movl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MOVL_H) @@ -50,7 +51,10 @@ simde_vmovl_s8(simde_int8x8_t a) { simde_int16x8_private r_; simde_int8x8_private a_ = simde_int8x8_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t va = __riscv_vlmul_trunc_v_i8m1_i8mf2 (a_.sv64); + r_.sv128 = __riscv_vwcvt_x_x_v_i16m1 (va, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -83,7 +87,10 @@ simde_vmovl_s16(simde_int16x4_t a) { simde_int32x4_private r_; simde_int16x4_private a_ = simde_int16x4_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) + #if defined(SIMDE_RISCV_V_NATIVE) + vint16mf2_t va = __riscv_vlmul_trunc_v_i16m1_i16mf2 (a_.sv64); + r_.sv128 = __riscv_vwcvt_x_x_v_i32m1 (va, 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -116,7 +123,10 @@ simde_vmovl_s32(simde_int32x2_t a) { simde_int64x2_private r_; simde_int32x2_private a_ = simde_int32x2_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint32mf2_t va = __riscv_vlmul_trunc_v_i32m1_i32mf2(a_.sv64); + r_.sv128 = __riscv_vwcvt_x_x_v_i64m1 (va, 2); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -149,7 +159,10 @@ simde_vmovl_u8(simde_uint8x8_t a) { simde_uint16x8_private r_; simde_uint8x8_private a_ = simde_uint8x8_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8mf2_t va = __riscv_vlmul_trunc_v_u8m1_u8mf2(a_.sv64); + r_.sv128 = __riscv_vwcvtu_x_x_v_u16m1 (va, 8); + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -182,7 +195,10 @@ simde_vmovl_u16(simde_uint16x4_t a) { simde_uint32x4_private r_; simde_uint16x4_private a_ = simde_uint16x4_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16mf2_t va = __riscv_vlmul_trunc_v_u16m1_u16mf2(a_.sv64); + r_.sv128 = __riscv_vwcvtu_x_x_v_u32m1 (va, 4); + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -215,7 +231,10 @@ simde_vmovl_u32(simde_uint32x2_t a) { simde_uint64x2_private r_; simde_uint32x2_private a_ = simde_uint32x2_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32mf2_t va = __riscv_vlmul_trunc_v_u32m1_u32mf2(a_.sv64); + r_.sv128 = __riscv_vwcvtu_x_x_v_u64m1 (va, 2); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/movn.h b/simde/arm/neon/movn.h index aa3ca453d..cd54f25ef 100644 --- a/simde/arm/neon/movn.h +++ b/simde/arm/neon/movn.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 
Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MOVN_H) @@ -42,7 +43,9 @@ simde_vmovn_s16(simde_int16x8_t a) { simde_int8x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_i8mf2_i8m1(__riscv_vncvt_x_x_w_i8mf2(a_.sv128, 8)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -68,7 +71,9 @@ simde_vmovn_s32(simde_int32x4_t a) { simde_int16x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vncvt_x_x_w_i16mf2(a_.sv128, 4)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -94,7 +99,9 @@ simde_vmovn_s64(simde_int64x2_t a) { simde_int32x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_i32mf2_i32m1(__riscv_vncvt_x_x_w_i32mf2(a_.sv128, 2)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -120,7 +127,9 @@ simde_vmovn_u16(simde_uint16x8_t a) { simde_uint8x8_private r_; simde_uint16x8_private a_ = simde_uint16x8_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_u8mf2_u8m1(__riscv_vncvt_x_x_w_u8mf2(a_.sv128, 8)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -146,7 +155,9 @@ simde_vmovn_u32(simde_uint32x4_t a) { simde_uint16x4_private r_; simde_uint32x4_private a_ = simde_uint32x4_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_u16mf2_u16m1(__riscv_vncvt_x_x_w_u16mf2(a_.sv128, 4)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -172,7 +183,9 @@ simde_vmovn_u64(simde_uint64x2_t a) { simde_uint32x2_private r_; simde_uint64x2_private a_ = simde_uint64x2_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vlmul_ext_v_u32mf2_u32m1(__riscv_vncvt_x_x_w_u32mf2(a_.sv128, 2)); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/qdmull.h b/simde/arm/neon/qdmull.h index 871257f61..446bd3ffd 100644 --- a/simde/arm/neon/qdmull.h +++ b/simde/arm/neon/qdmull.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ /* Implementation notes (seanptmaher): @@ -97,11 +98,17 @@ simde_vqdmull_s16(simde_int16x4_t a, simde_int16x4_t b) { simde_int16x4_private a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m2_t mul = __riscv_vwmul_vv_i32m2(a_.sv64, b_.sv64, 4); + r_.sv128 = __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vmerge_vxm_i32m2(__riscv_vmerge_vxm_i32m2( + __riscv_vsll_vx_i32m2(mul, 1, 4), INT32_MAX, __riscv_vmsgt_vx_i32m2_b16(mul, INT32_C(0x3FFFFFFF), 4), 4), + 
INT32_MIN, __riscv_vmslt_vx_i32m2_b16(mul, -INT32_C(0x40000000), 4), 4)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[i]); + } + #endif return simde_int32x4_from_private(r_); #endif @@ -137,10 +144,17 @@ simde_vqdmull_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m2_t mul = __riscv_vwmul_vv_i64m2(a_.sv64, b_.sv64, 2); + r_.sv128 = __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vmerge_vxm_i64m2(__riscv_vmerge_vxm_i64m2( + __riscv_vsll_vx_i64m2(mul, 1, 2), INT64_MAX, __riscv_vmsgt_vx_i64m2_b32(mul, INT64_C(0x3FFFFFFFFFFFFFFF), 2), 2), + INT64_MIN, __riscv_vmslt_vx_i64m2_b32(mul, -INT64_C(0x4000000000000000), 2), 2)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[i]); + } + #endif return simde_int64x2_from_private(r_); #endif diff --git a/simde/arm/neon/qshlu_n.h b/simde/arm/neon/qshlu_n.h index db9610a0f..587fc439c 100644 --- a/simde/arm/neon/qshlu_n.h +++ b/simde/arm/neon/qshlu_n.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Atharva Nimbalkar * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_QSHLU_N_H) @@ -122,8 +123,11 @@ simde_vqshlu_n_s8(simde_int8x8_t a, const int n) #else simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_uint8x8_private r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t shift = __riscv_vsll_vx_u8m1(__riscv_vreinterpret_v_i8m1_u8m1(a_.sv64), n, 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(shift, UINT8_MAX, __riscv_vmsne_vv_u8m1_b8(__riscv_vsrl_vx_u8m1(shift, n, 8), __riscv_vreinterpret_v_i8m1_u8m1(a_.sv64), 8), 8); + r_.sv64 = __riscv_vmerge_vxm_u8m1(r_.sv64, 0, __riscv_vmslt_vx_i8m1_b8(a_.sv64, 0, 8), 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; __typeof__(r_.values) overflow = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (shifted >> n) != HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values)); @@ -168,8 +172,11 @@ simde_vqshlu_n_s16(simde_int16x4_t a, const int n) #else simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_uint16x4_private r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t shift = __riscv_vsll_vx_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(a_.sv64), n, 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(shift, UINT16_MAX, __riscv_vmsne_vv_u16m1_b16(__riscv_vsrl_vx_u16m1(shift, n, 4), __riscv_vreinterpret_v_i16m1_u16m1(a_.sv64), 4), 4); + r_.sv64 = __riscv_vmerge_vxm_u16m1(r_.sv64, 0, __riscv_vmslt_vx_i16m1_b16(a_.sv64, 0, 4), 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; __typeof__(r_.values) overflow = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (shifted >> n) != HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values)); @@ 
-217,7 +224,11 @@ simde_vqshlu_n_s32(simde_int32x2_t a, const int n) simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_uint32x2_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t shift = __riscv_vsll_vx_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(a_.sv64), n, 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(shift, UINT32_MAX, __riscv_vmsne_vv_u32m1_b32(__riscv_vsrl_vx_u32m1(shift, n, 2), __riscv_vreinterpret_v_i32m1_u32m1(a_.sv64), 2), 2); + r_.sv64 = __riscv_vmerge_vxm_u32m1(r_.sv64, 0, __riscv_vmslt_vx_i32m1_b32(a_.sv64, 0, 2), 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; __typeof__(r_.values) overflow = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (shifted >> n) != HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values)); @@ -264,7 +275,11 @@ simde_vqshlu_n_s64(simde_int64x1_t a, const int n) simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_uint64x1_private r_; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t shift = __riscv_vsll_vx_u64m1(__riscv_vreinterpret_v_i64m1_u64m1(a_.sv64), n, 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(shift, UINT64_MAX, __riscv_vmsne_vv_u64m1_b64(__riscv_vsrl_vx_u64m1(shift, n, 1), __riscv_vreinterpret_v_i64m1_u64m1(a_.sv64), 1), 1); + r_.sv64 = __riscv_vmerge_vxm_u64m1(r_.sv64, 0, __riscv_vmslt_vx_i64m1_b64(a_.sv64, 0, 1), 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; __typeof__(r_.values) overflow = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (shifted >> n) != HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values)); @@ -304,6 +319,10 @@ simde_vqshluq_n_s8(simde_int8x16_t a, const int n) const v128_t overflow = wasm_i8x16_ne(a_.v128, wasm_u8x16_shr(r_.v128, HEDLEY_STATIC_CAST(uint32_t, n))); r_.v128 = wasm_v128_or(r_.v128, overflow); r_.v128 = wasm_v128_andnot(r_.v128, wasm_i8x16_shr(a_.v128, 7)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t shift = __riscv_vsll_vx_u8m1(__riscv_vreinterpret_v_i8m1_u8m1(a_.sv128), n, 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(shift, UINT8_MAX, __riscv_vmsne_vv_u8m1_b8(__riscv_vsrl_vx_u8m1(shift, n, 16), __riscv_vreinterpret_v_i8m1_u8m1(a_.sv128), 16), 16); + r_.sv128 = __riscv_vmerge_vxm_u8m1(r_.sv128, 0, __riscv_vmslt_vx_i8m1_b8(a_.sv128, 0, 16), 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; @@ -343,6 +362,10 @@ simde_vqshluq_n_s16(simde_int16x8_t a, const int n) const v128_t overflow = wasm_i16x8_ne(a_.v128, wasm_u16x8_shr(r_.v128, HEDLEY_STATIC_CAST(uint32_t, n))); r_.v128 = wasm_v128_or(r_.v128, overflow); r_.v128 = wasm_v128_andnot(r_.v128, wasm_i16x8_shr(a_.v128, 15)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint16m1_t shift = __riscv_vsll_vx_u16m1(__riscv_vreinterpret_v_i16m1_u16m1(a_.sv128), n, 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(shift, UINT16_MAX, __riscv_vmsne_vv_u16m1_b16(__riscv_vsrl_vx_u16m1(shift, n, 8), __riscv_vreinterpret_v_i16m1_u16m1(a_.sv128), 8), 8); + r_.sv128 = __riscv_vmerge_vxm_u16m1(r_.sv128, 0, __riscv_vmslt_vx_i16m1_b16(a_.sv128, 0, 8), 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; @@ -382,6 +405,10 @@ 
simde_vqshluq_n_s32(simde_int32x4_t a, const int n) const v128_t overflow = wasm_i32x4_ne(a_.v128, wasm_u32x4_shr(r_.v128, HEDLEY_STATIC_CAST(uint32_t, n))); r_.v128 = wasm_v128_or(r_.v128, overflow); r_.v128 = wasm_v128_andnot(r_.v128, wasm_i32x4_shr(a_.v128, 31)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint32m1_t shift = __riscv_vsll_vx_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(a_.sv128), n, 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(shift, UINT32_MAX, __riscv_vmsne_vv_u32m1_b32(__riscv_vsrl_vx_u32m1(shift, n, 4), __riscv_vreinterpret_v_i32m1_u32m1(a_.sv128), 4), 4); + r_.sv128 = __riscv_vmerge_vxm_u32m1(r_.sv128, 0, __riscv_vmslt_vx_i32m1_b32(a_.sv128, 0, 4), 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; @@ -421,6 +448,10 @@ simde_vqshluq_n_s64(simde_int64x2_t a, const int n) const v128_t overflow = wasm_i64x2_ne(a_.v128, wasm_u64x2_shr(r_.v128, HEDLEY_STATIC_CAST(uint32_t, n))); r_.v128 = wasm_v128_or(r_.v128, overflow); r_.v128 = wasm_v128_andnot(r_.v128, wasm_i64x2_shr(a_.v128, 63)); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t shift = __riscv_vsll_vx_u64m1(__riscv_vreinterpret_v_i64m1_u64m1(a_.sv128), n, 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(shift, UINT64_MAX, __riscv_vmsne_vv_u64m1_b64(__riscv_vsrl_vx_u64m1(shift, n, 2), __riscv_vreinterpret_v_i64m1_u64m1(a_.sv128), 2), 2); + r_.sv128 = __riscv_vmerge_vxm_u64m1(r_.sv128, 0, __riscv_vmslt_vx_i64m1_b64(a_.sv128, 0, 2), 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) __typeof__(r_.values) shifted = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values) << n; diff --git a/simde/arm/neon/rnda.h b/simde/arm/neon/rnda.h index 964e682ea..05a540366 100644 --- a/simde/arm/neon/rnda.h +++ b/simde/arm/neon/rnda.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_RNDA_H) @@ -56,11 +57,14 @@ simde_vrnda_f16(simde_float16x4_t a) { simde_float16x4_private r_, a_ = simde_float16x4_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vrndah_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1_rm(a_.sv64, 0, 4), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndah_f16(a_.values[i]); + } + #endif return simde_float16x4_from_private(r_); #endif @@ -80,10 +84,21 @@ simde_vrnda_f32(simde_float32x2_t a) { r_, a_ = simde_float32x2_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_math_roundf(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_NANS) + r_.sv64 = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a_.sv64, 0, 2), 2); + #else + simde_float32 nan = SIMDE_MATH_NAN; + vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2); + r_.sv64 = __riscv_vfmerge_vfm_f32m1(__riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a_.sv64, 0, 2), 2), \ + nan, mask, 2); + #endif + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_roundf(a_.values[i]); + } + #endif return simde_float32x2_from_private(r_); #endif @@ -103,10 +118,21 @@ 
simde_vrnda_f64(simde_float64x1_t a) { r_, a_ = simde_float64x1_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_math_round(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_NANS) + r_.sv64 = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(a_.sv64, 0, 1), 1); + #else + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1); + r_.sv64 = __riscv_vfmerge_vfm_f64m1(__riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(a_.sv64, 0, 1), 1), \ + nan, mask, 1); + #endif + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_round(a_.values[i]); + } + #endif return simde_float64x1_from_private(r_); #endif @@ -125,11 +151,14 @@ simde_vrndaq_f16(simde_float16x8_t a) { simde_float16x8_private r_, a_ = simde_float16x8_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vrndah_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1_rm(a_.sv128, 0, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndah_f16(a_.values[i]); + } + #endif return simde_float16x8_from_private(r_); #endif @@ -149,10 +178,21 @@ simde_vrndaq_f32(simde_float32x4_t a) { r_, a_ = simde_float32x4_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_math_roundf(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_NANS) + r_.sv128 = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a_.sv128, 0, 4), 4); + #else + simde_float32 nan = SIMDE_MATH_NAN; + vbool32_t mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4); + r_.sv128 = __riscv_vfmerge_vfm_f32m1(__riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a_.sv128, 0, 4), 4), \ + nan, mask, 4); + #endif + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_roundf(a_.values[i]); + } + #endif return simde_float32x4_from_private(r_); #endif @@ -172,10 +212,21 @@ simde_vrndaq_f64(simde_float64x2_t a) { r_, a_ = simde_float64x2_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_math_round(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + #if defined(SIMDE_FAST_NANS) + r_.sv128 = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(a_.sv128, 0, 2), 2); + #else + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2); + r_.sv128 = __riscv_vfmerge_vfm_f64m1(__riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1_rm(a_.sv128, 0, 2), 2), \ + nan, mask, 2); + #endif + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_round(a_.values[i]); + } + #endif return simde_float64x2_from_private(r_); #endif diff --git a/simde/arm/neon/rsubhn.h b/simde/arm/neon/rsubhn.h index 2d6a15da4..5d195f83c 100644 --- a/simde/arm/neon/rsubhn.h +++ b/simde/arm/neon/rsubhn.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung 
(Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_RSUBHN_H) @@ -48,10 +49,14 @@ simde_vrsubhn_s16(simde_int16x8_t a, simde_int16x8_t b) { a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b); int16_t round_cast = 1 << 7; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, a_.values[i] - b_.values[i] + round_cast); - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_i16m1(__riscv_vsub_vv_i16m1(a_.sv128, b_.sv128, 8), round_cast, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, a_.values[i] - b_.values[i] + round_cast); + } + #endif return simde_vmovn_s16(simde_vshrq_n_s16(simde_int16x8_from_private(r_), 8)); #endif } @@ -71,10 +76,14 @@ simde_vrsubhn_s32(simde_int32x4_t a, simde_int32x4_t b) { a_ = simde_int32x4_to_private(a), b_ = simde_int32x4_to_private(b); int round_cast = 1 << 15; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] - b_.values[i] + round_cast; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_i32m1(__riscv_vsub_vv_i32m1(a_.sv128, b_.sv128, 4), round_cast, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] - b_.values[i] + round_cast; + } + #endif return simde_vmovn_s32(simde_vshrq_n_s32(simde_int32x4_from_private(r_), 16)); #endif } @@ -93,12 +102,17 @@ simde_vrsubhn_s64(simde_int64x2_t a, simde_int64x2_t b) { r_, a_ = simde_int64x2_to_private(a), b_ = simde_int64x2_to_private(b); - int64_t round_cast = 1ll << 31; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = ((a_.values[i] - b_.values[i] + round_cast) >> 32); - } - return simde_vmovn_s64(simde_int64x2_from_private(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_i64m1(__riscv_vsub_vv_i64m1(a_.sv128, b_.sv128, 2), 0x80000000, 2); + return simde_vmovn_s64(simde_vshrq_n_s64(simde_int64x2_from_private(r_), 32)); + #else + int64_t round_cast = 1ll << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] - b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_s64(simde_int64x2_from_private(r_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -117,10 +131,14 @@ simde_vrsubhn_u16(simde_uint16x8_t a, simde_uint16x8_t b) { a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); uint16_t round_cast = 1 << 7; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, a_.values[i] - b_.values[i] + round_cast); - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_u16m1(__riscv_vsub_vv_u16m1(a_.sv128, b_.sv128, 8), round_cast, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, a_.values[i] - b_.values[i] + round_cast); + } + #endif return simde_vmovn_u16(simde_vshrq_n_u16(simde_uint16x8_from_private(r_), 8)); #endif } @@ -140,10 +158,14 @@ simde_vrsubhn_u32(simde_uint32x4_t a, simde_uint32x4_t b) { a_ = simde_uint32x4_to_private(a), b_ = 
simde_uint32x4_to_private(b); uint32_t round_cast = 1 << 15; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] - b_.values[i] + round_cast; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_u32m1(__riscv_vsub_vv_u32m1(a_.sv128, b_.sv128, 4), round_cast, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] - b_.values[i] + round_cast; + } + #endif return simde_vmovn_u32(simde_vshrq_n_u32(simde_uint32x4_from_private(r_), 16)); #endif } @@ -162,12 +184,17 @@ simde_vrsubhn_u64(simde_uint64x2_t a, simde_uint64x2_t b) { r_, a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - uint64_t round_cast = 1ull << 31; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = ((a_.values[i] - b_.values[i] + round_cast) >> 32); - } - return simde_vmovn_u64(simde_uint64x2_from_private(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vx_u64m1(__riscv_vsub_vv_u64m1(a_.sv128, b_.sv128, 2), 0x80000000, 2); + return simde_vmovn_u64(simde_vshrq_n_u64(simde_uint64x2_from_private(r_), 32)); + #else + uint64_t round_cast = 1ull << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] - b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_u64(simde_uint64x2_from_private(r_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/shl.h b/simde/arm/neon/shl.h index b4fd84256..5a250dc9a 100644 --- a/simde/arm/neon/shl.h +++ b/simde/arm/neon/shl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHL_H) @@ -143,13 +144,23 @@ simde_vshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) { r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int8_t, - (b_.values[i] >= 0) ? - (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t bit_shift_rst = __riscv_vmerge_vxm_i8m1( + __riscv_vsll_vv_i8m1 (a_.sv64, __riscv_vreinterpret_v_i8m1_u8m1(b_.sv64), 8), 0, __riscv_vmsge_vx_i8m1_b8(b_.sv64, 8, 8), 8); + vint8m1_t b_abs = __riscv_vmax_vv_i8m1 (b_.sv64, __riscv_vneg_v_i8m1 (b_.sv64, 8), 8); + vuint8m1_t u_b_abs = __riscv_vreinterpret_v_i8m1_u8m1 (b_abs); + vint8m1_t scal_shift_rst = __riscv_vmerge_vvm_i8m1(__riscv_vsra_vv_i8m1 (a_.sv64, u_b_abs, 8), \ + __riscv_vsra_vx_i8m1(a_.sv64, 7, 8), __riscv_vmsle_vx_i8m1_b8(b_.sv64, -8, 8), 8); + r_.sv64 = __riscv_vmerge_vvm_i8m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8m1_b8 (b_.sv64, 0, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, + (b_.values[i] >= 0) ? + (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -8) ? 
(a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_int8x8_from_private(r_); @@ -180,14 +191,25 @@ simde_vshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) { _mm_cmpgt_epi32(_mm_setzero_si128(), b128)); r_.m64 = _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100))); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, - (b_.values[i] >= 0) ? - (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_8mf2 = __riscv_vncvt_x_x_w_i8mf2 (b_.sv64, 4); + vint16m1_t bit_shift_rst = __riscv_vmerge_vxm_i16m1(__riscv_vsll_vv_i16m1 (a_.sv64, __riscv_vreinterpret_v_i16m1_u16m1(b_.sv64), 4), 0 \ + , __riscv_vmsge_vx_i8mf2_b16(b_8mf2, 16, 8), 4); + vint16m1_t b_abs = __riscv_vmax_vv_i16m1 (b_.sv64, __riscv_vneg_v_i16m1 (b_.sv64, 4), 4); + vuint16m1_t u_b_abs = __riscv_vreinterpret_v_i16m1_u16m1 (b_abs); + vint16m1_t scal_shift_rst = __riscv_vmerge_vvm_i16m1(__riscv_vsra_vv_i16m1 (a_.sv64, u_b_abs, 4) + , __riscv_vsra_vx_i16m1(a_.sv64, 15, 4), __riscv_vmsle_vx_i8mf2_b16(b_8mf2, -16, 8), 4); + r_.sv64 = __riscv_vmerge_vvm_i16m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf2_b16 (b_8mf2, 0, 8), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, + (b_.values[i] >= 0) ? + (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_int16x4_from_private(r_); @@ -218,14 +240,25 @@ simde_vshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) { _mm_cmpgt_epi32(_mm_setzero_si128(), b128)); r_.m64 = _mm_movepi64_pi64(r128); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = - (b_.values[i] >= 0) ? - (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf4_t b_8mf4 = __riscv_vncvt_x_x_w_i8mf4 (__riscv_vncvt_x_x_w_i16mf2 (b_.sv64, 2), 4); + vint32m1_t bit_shift_rst = __riscv_vmerge_vxm_i32m1(__riscv_vsll_vv_i32m1 (a_.sv64, __riscv_vreinterpret_v_i32m1_u32m1(b_.sv64), 2), 0 + , __riscv_vmsge_vx_i8mf4_b32(b_8mf4, 32, 2), 2); + vint32m1_t b_abs = __riscv_vmax_vv_i32m1 (b_.sv64, __riscv_vneg_v_i32m1 (b_.sv64, 2), 2); + vuint32m1_t u_b_abs = __riscv_vreinterpret_v_i32m1_u32m1 (b_abs); + vint32m1_t scal_shift_rst = __riscv_vmerge_vvm_i32m1(__riscv_vsra_vv_i32m1 (a_.sv64, u_b_abs, 2) + , __riscv_vsra_vx_i32m1(a_.sv64, 31, 2), __riscv_vmsle_vx_i8mf4_b32(b_8mf4, -32, 2), 2); + r_.sv64 = __riscv_vmerge_vvm_i32m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf4_b32 (b_8mf4, 0, 8), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = + (b_.values[i] >= 0) ? + (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -32) ? 
(a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]); + } + #endif #endif return simde_int32x2_from_private(r_); @@ -267,10 +300,21 @@ simde_vshl_s64 (const simde_int64x1_t a, const simde_int64x1_t b) { _mm_cmpgt_epi64(zero, _mm_slli_epi64(b128, 56))); r_.m64 = _mm_movepi64_pi64(r128); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vshld_s64(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf8_t b_8mf8 = __riscv_vncvt_x_x_w_i8mf8 (__riscv_vncvt_x_x_w_i16mf4 (__riscv_vncvt_x_x_w_i32mf2 (b_.sv64, 1), 2), 4); + vint64m1_t bit_shift_rst = __riscv_vmerge_vxm_i64m1(__riscv_vsll_vv_i64m1 (a_.sv64, __riscv_vreinterpret_v_i64m1_u64m1(b_.sv64), 1), 0 + , __riscv_vmsge_vx_i8mf8_b64(b_8mf8, 64, 1), 1); + vint64m1_t b_abs = __riscv_vmax_vv_i64m1 (b_.sv64, __riscv_vneg_v_i64m1 (b_.sv64, 1), 1); + vuint64m1_t u_b_abs = __riscv_vreinterpret_v_i64m1_u64m1 (b_abs); + vint64m1_t scal_shift_rst = __riscv_vmerge_vvm_i64m1(__riscv_vsra_vv_i64m1 (a_.sv64, u_b_abs, 1) + , __riscv_vsra_vx_i64m1(a_.sv64, 63, 1), __riscv_vmsle_vx_i8mf8_b64(b_8mf8, -64, 1), 1); + r_.sv64 = __riscv_vmerge_vvm_i64m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf8_b64 (b_8mf8, 0, 8), 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vshld_s64(a_.values[i], b_.values[i]); + } + #endif #endif return simde_int64x1_from_private(r_); @@ -308,13 +352,22 @@ simde_vshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) { r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, - (simde_math_abs(b_.values[i]) >= 8) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t u_b = __riscv_vreinterpret_v_i8m1_u8m1 (b_.sv64); + vint8m1_t b_abs = __riscv_vmax_vv_i8m1 (b_.sv64, __riscv_vneg_v_i8m1 (b_.sv64, 8), 8); + vuint8m1_t u_b_abs = __riscv_vreinterpret_v_i8m1_u8m1 (b_abs); + r_.sv64 = __riscv_vmerge_vxm_u8m1(__riscv_vmerge_vvm_u8m1(__riscv_vsrl_vv_u8m1(a_.sv64, u_b_abs, 8) + , __riscv_vsll_vv_u8m1 (a_.sv64, u_b, 8), __riscv_vmsge_vx_i8m1_b8(b_.sv64, 0, 8), 8), 0 \ + ,__riscv_vmsgeu_vx_u8m1_b8(u_b_abs, 8, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, + (simde_math_abs(b_.values[i]) >= 8) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_uint8x8_from_private(r_); @@ -345,14 +398,26 @@ simde_vshl_u16 (const simde_uint16x4_t a, const simde_int16x4_t b) { _mm_cmpgt_epi32(_mm_setzero_si128(), b128)); r_.m64 = _mm_movepi64_pi64(_mm_shuffle_epi8(r128, _mm_set1_epi64x(0x0D0C090805040100))); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, - (simde_math_abs(b_.values[i]) >= 16) ? 0 : - (b_.values[i] >= 0) ? 
(a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_8mf2 = __riscv_vncvt_x_x_w_i8mf2 (b_.sv64, 4); + vint8mf2_t b_8mf2_abs = __riscv_vmax_vv_i8mf2 (b_8mf2, __riscv_vneg_v_i8mf2 (b_8mf2, 8), 8); + vuint8mf2_t u_b_8mf2_abs = __riscv_vreinterpret_v_i8mf2_u8mf2 (b_8mf2_abs); + vuint16m1_t u_b = __riscv_vreinterpret_v_i16m1_u16m1 (b_.sv64); + vint16m1_t b_abs = __riscv_vmax_vv_i16m1 (b_.sv64, __riscv_vneg_v_i16m1 (b_.sv64, 4), 4); + vuint16m1_t u_b_abs = __riscv_vreinterpret_v_i16m1_u16m1 (b_abs); + r_.sv64 = __riscv_vmerge_vxm_u16m1(__riscv_vmerge_vvm_u16m1(__riscv_vsrl_vv_u16m1(a_.sv64, u_b_abs, 4) + , __riscv_vsll_vv_u16m1 (a_.sv64, u_b, 4), __riscv_vmsge_vx_i16m1_b16(b_.sv64, 0, 4), 4) + , 0, __riscv_vmsgeu_vx_u8mf2_b16(u_b_8mf2_abs, 16, 8), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, + (simde_math_abs(b_.values[i]) >= 16) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_uint16x4_from_private(r_); @@ -383,14 +448,26 @@ simde_vshl_u32 (const simde_uint32x2_t a, const simde_int32x2_t b) { _mm_cmpgt_epi32(_mm_setzero_si128(), b128)); r_.m64 = _mm_movepi64_pi64(r128); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = - (simde_math_abs(b_.values[i]) >= 32) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf4_t b_8mf4 = __riscv_vncvt_x_x_w_i8mf4 (__riscv_vncvt_x_x_w_i16mf2 (b_.sv64, 2), 4); + vint8mf4_t b_8mf4_abs = __riscv_vmax_vv_i8mf4 (b_8mf4, __riscv_vneg_v_i8mf4 (b_8mf4, 8), 8); + vuint8mf4_t u_b_8mf4_abs = __riscv_vreinterpret_v_i8mf4_u8mf4 (b_8mf4_abs); + vuint32m1_t u_b = __riscv_vreinterpret_v_i32m1_u32m1 (b_.sv64); + vint32m1_t b_abs = __riscv_vmax_vv_i32m1 (b_.sv64, __riscv_vneg_v_i32m1 (b_.sv64, 2), 2); + vuint32m1_t u_b_abs = __riscv_vreinterpret_v_i32m1_u32m1 (b_abs); + r_.sv64 = __riscv_vmerge_vxm_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vsrl_vv_u32m1(a_.sv64, u_b_abs, 2) + , __riscv_vsll_vv_u32m1 (a_.sv64, u_b, 2), __riscv_vmsge_vx_i32m1_b32(b_.sv64, 0, 2), 2), 0 + , __riscv_vmsgeu_vx_u8mf4_b32(u_b_8mf4_abs, 32, 8), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = + (simde_math_abs(b_.values[i]) >= 32) ? 0 : + (b_.values[i] >= 0) ? 
(a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i]); + } + #endif #endif return simde_uint32x2_from_private(r_); @@ -430,10 +507,24 @@ simde_vshl_u64 (const simde_uint64x1_t a, const simde_int64x1_t b) { _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b128, 56))); r_.m64 = _mm_movepi64_pi64(r128); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vshld_u64(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + // change b_ to int8_t + vint8mf8_t b_8mf8 = __riscv_vncvt_x_x_w_i8mf8(__riscv_vncvt_x_x_w_i16mf4 \ + (__riscv_vncvt_x_x_w_i32mf2 (b_.sv64, 1), 2), 4); + vint8mf8_t b_8mf8_abs = __riscv_vmax_vv_i8mf8(b_8mf8, __riscv_vneg_v_i8mf8 (b_8mf8, 8), 8); + vuint8mf8_t u_b_8mf8_abs = __riscv_vreinterpret_v_i8mf8_u8mf8 (b_8mf8_abs); + vuint64m1_t u_b = __riscv_vreinterpret_v_i64m1_u64m1 (b_.sv64); + vint64m1_t b_abs = __riscv_vmax_vv_i64m1 (b_.sv64, __riscv_vneg_v_i64m1 (b_.sv64, 1), 1); + vuint64m1_t u_b_abs = __riscv_vreinterpret_v_i64m1_u64m1 (b_abs); + r_.sv64 = __riscv_vmerge_vxm_u64m1(__riscv_vmerge_vvm_u64m1(__riscv_vsrl_vv_u64m1(a_.sv64, u_b_abs, 1) + , __riscv_vsll_vv_u64m1 (a_.sv64, u_b, 1), __riscv_vmsge_vx_i64m1_b64(b_.sv64, 0, 1), 1), 0 + , __riscv_vmsgeu_vx_u8mf8_b64(u_b_8mf8_abs, 64, 8), 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vshld_u64(a_.values[i], b_.values[i]); + } + #endif #endif return simde_uint64x1_from_private(r_); @@ -477,13 +568,23 @@ simde_vshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) { _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256)); r_.m128i = _mm256_cvtepi16_epi8(r256); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int8_t, - (b_.values[i] >= 0) ? - (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -8) ? (a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1_t bit_shift_rst = __riscv_vmerge_vxm_i8m1(__riscv_vsll_vv_i8m1 (a_.sv128, __riscv_vreinterpret_v_i8m1_u8m1(b_.sv128), 16), \ + 0, __riscv_vmsge_vx_i8m1_b8(b_.sv128, 8, 16), 16); + vint8m1_t b_abs = __riscv_vmax_vv_i8m1 (b_.sv128, __riscv_vneg_v_i8m1 (b_.sv128, 16), 16); + vuint8m1_t u_b_abs = __riscv_vreinterpret_v_i8m1_u8m1 (b_abs); + vint8m1_t scal_shift_rst = __riscv_vmerge_vvm_i8m1(__riscv_vsra_vv_i8m1 (a_.sv128, u_b_abs, 16) + , __riscv_vsra_vx_i8m1(a_.sv128, 7, 16), __riscv_vmsle_vx_i8m1_b8(b_.sv128, -8, 16), 16); + r_.sv128 = __riscv_vmerge_vvm_i8m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8m1_b8 (b_.sv128, 0, 16), 16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, + (b_.values[i] >= 0) ? + (b_.values[i] >= 8) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -8) ? 
(a_.values[i] >> 7) : (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_int8x16_from_private(r_); @@ -536,14 +637,26 @@ simde_vshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, - (b_.values[i] >= 0) ? - (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_8mf2 = __riscv_vncvt_x_x_w_i8mf2 (b_.sv128, 8); + vint8mf2_t b_8mf2_abs = __riscv_vmax_vv_i8mf2 (b_8mf2, __riscv_vneg_v_i8mf2 (b_8mf2, 16), 16); + vuint8mf2_t u_b_8mf2_abs = __riscv_vreinterpret_v_i8mf2_u8mf2(b_8mf2_abs); + vuint16m1_t u_b_abs = __riscv_vwcvtu_x_x_v_u16m1 (u_b_8mf2_abs, 16); + vint16m1_t bit_shift_rst = __riscv_vmerge_vxm_i16m1(__riscv_vsll_vv_i16m1 (a_.sv128, __riscv_vreinterpret_v_i16m1_u16m1(b_.sv128), 8), 0, \ + __riscv_vmsge_vx_i8mf2_b16(b_8mf2, 16, 16), 8); + vint16m1_t scal_shift_rst = __riscv_vmerge_vvm_i16m1(__riscv_vsra_vv_i16m1 (a_.sv128, u_b_abs, 8), + __riscv_vsra_vx_i16m1(a_.sv128, 15, 8), __riscv_vmsle_vx_i8mf2_b16(b_8mf2, -16, 16), 8); + r_.sv128 = __riscv_vmerge_vvm_i16m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf2_b16 (b_8mf2, 0, 16), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, + (b_.values[i] >= 0) ? + (b_.values[i] >= 16) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -16) ? (a_.values[i] >> 15) : (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_int16x8_from_private(r_); @@ -588,14 +701,26 @@ simde_vshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) { _mm_srav_epi32(a_.m128i, _mm_abs_epi32(B)), _mm_cmpgt_epi32(_mm_setzero_si128(), B)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = - (b_.values[i] >= 0) ? - (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) : - (b_.values[i] <= -32) ? 
(a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf4_t b_8mf4 = __riscv_vncvt_x_x_w_i8mf4 (__riscv_vncvt_x_x_w_i16mf2 (b_.sv128, 4), 8); + vint8mf4_t b_8mf4_abs = __riscv_vmax_vv_i8mf4 (b_8mf4, __riscv_vneg_v_i8mf4 (b_8mf4, 16), 16); + vuint8mf4_t u_b_8mf4_abs = __riscv_vreinterpret_v_i8mf4_u8mf4 (b_8mf4_abs); + vuint32m1_t u_b_abs = __riscv_vwcvtu_x_x_v_u32m1 (__riscv_vwcvtu_x_x_v_u16mf2 (u_b_8mf4_abs, 16), 8); + vint32m1_t bit_shift_rst = __riscv_vmerge_vxm_i32m1(__riscv_vsll_vv_i32m1 (a_.sv128, __riscv_vreinterpret_v_i32m1_u32m1(b_.sv128), 4), 0, + __riscv_vmsge_vx_i8mf4_b32(b_8mf4, 32, 16), 4); + vint32m1_t scal_shift_rst = __riscv_vmerge_vvm_i32m1(__riscv_vsra_vv_i32m1 (a_.sv128, u_b_abs, 4), \ + __riscv_vsra_vx_i32m1(a_.sv128, 31, 4), __riscv_vmsle_vx_i8mf4_b32(b_8mf4, -32, 4), 4); + r_.sv128 = __riscv_vmerge_vvm_i32m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf4_b32 (b_8mf4, 0, 16), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = + (b_.values[i] >= 0) ? + (b_.values[i] >= 32) ? 0 : (a_.values[i] << b_.values[i]) : + (b_.values[i] <= -32) ? (a_.values[i] >> 31) : (a_.values[i] >> -b_.values[i]); + } + #endif #endif return simde_int32x4_from_private(r_); @@ -649,10 +774,21 @@ simde_vshlq_s64 (const simde_int64x2_t a, const simde_int64x2_t b) { _mm_xor_si128(_mm_srlv_epi64(_mm_xor_si128(a_.m128i, maska), b_abs), maska), _mm_cmpgt_epi64(zero, _mm_slli_epi64(b_.m128i, 56))); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vshld_s64(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf8_t b_8mf8 = __riscv_vncvt_x_x_w_i8mf8 (__riscv_vncvt_x_x_w_i16mf4 (__riscv_vncvt_x_x_w_i32mf2 (b_.sv128, 2), 4), 8); + vint8mf8_t b_8mf8_abs = __riscv_vmax_vv_i8mf8 (b_8mf8, __riscv_vneg_v_i8mf8 (b_8mf8, 16), 16); + vuint8mf8_t u_b_8mf8_abs = __riscv_vreinterpret_v_i8mf8_u8mf8 (b_8mf8_abs); + vuint64m1_t u_b_abs = __riscv_vwcvtu_x_x_v_u64m1(__riscv_vwcvtu_x_x_v_u32mf2 (__riscv_vwcvtu_x_x_v_u16mf4(u_b_8mf8_abs, 16), 8), 4); + vint64m1_t bit_shift_rst = __riscv_vmerge_vxm_i64m1(__riscv_vsll_vv_i64m1 (a_.sv128, __riscv_vreinterpret_v_i64m1_u64m1(b_.sv128), 2), 0, __riscv_vmsge_vx_i8mf8_b64(b_8mf8, 64, 2), 2); + vint64m1_t scal_shift_rst = __riscv_vmerge_vvm_i64m1(__riscv_vsra_vv_i64m1 (a_.sv128, u_b_abs, 2) + , __riscv_vsra_vx_i64m1(a_.sv128, 63, 2), __riscv_vmsle_vx_i8mf8_b64(b_8mf8, -64, 2), 2); + r_.sv128 = __riscv_vmerge_vvm_i64m1 (bit_shift_rst, scal_shift_rst, __riscv_vmslt_vx_i8mf8_b64 (b_8mf8, 0, 16), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vshld_s64(a_.values[i], b_.values[i]); + } + #endif #endif return simde_int64x2_from_private(r_); @@ -689,13 +825,22 @@ simde_vshlq_u8 (const simde_uint8x16_t a, const simde_int8x16_t b) { _mm256_cmpgt_epi16(_mm256_setzero_si256(), b256)); r_.m128i = _mm256_cvtepi16_epi8(r256); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, - (simde_math_abs(b_.values[i]) >= 8) ? 0 : - (b_.values[i] >= 0) ? 
(a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1_t u_b = __riscv_vreinterpret_v_i8m1_u8m1 (b_.sv128); + vint8m1_t b_abs = __riscv_vmax_vv_i8m1 (b_.sv128, __riscv_vneg_v_i8m1 (b_.sv128, 16), 16); + vuint8m1_t u_b_abs = __riscv_vreinterpret_v_i8m1_u8m1 (b_abs); + r_.sv128 = __riscv_vmerge_vxm_u8m1(__riscv_vmerge_vvm_u8m1(__riscv_vsrl_vv_u8m1(a_.sv128, u_b_abs, 16) + , __riscv_vsll_vv_u8m1 (a_.sv128, u_b, 16), __riscv_vmsge_vx_i8m1_b8(b_.sv128, 0, 16), 16), 0 + , __riscv_vmsgeu_vx_u8m1_b8(u_b_abs, 8, 16), 16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, + (simde_math_abs(b_.values[i]) >= 8) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_uint8x16_from_private(r_); @@ -746,14 +891,25 @@ simde_vshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) { r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, - (simde_math_abs(b_.values[i]) >= 16) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i])); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf2_t b_8mf2 = __riscv_vncvt_x_x_w_i8mf2 (b_.sv128, 8); + vint8mf2_t b_8mf2_abs = __riscv_vmax_vv_i8mf2 (b_8mf2, __riscv_vneg_v_i8mf2 (b_8mf2, 16), 16); + vuint8mf2_t u_b_8mf2_abs = __riscv_vreinterpret_v_i8mf2_u8mf2 (b_8mf2_abs); + vuint16m1_t u_b = __riscv_vreinterpret_v_i16m1_u16m1 (b_.sv128); + vuint16m1_t u_b_abs = __riscv_vwcvtu_x_x_v_u16m1 (u_b_8mf2_abs, 16); + r_.sv128 = __riscv_vmerge_vxm_u16m1(__riscv_vmerge_vvm_u16m1(__riscv_vsrl_vv_u16m1(a_.sv128, u_b_abs, 8), + __riscv_vsll_vv_u16m1 (a_.sv128, u_b, 8), __riscv_vmsge_vx_i8mf2_b16(b_8mf2, 0, 8), 8), + 0, __riscv_vmsgeu_vx_u8mf2_b16(u_b_8mf2_abs, 16, 16), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, + (simde_math_abs(b_.values[i]) >= 16) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i])); + } + #endif #endif return simde_uint16x8_from_private(r_); @@ -790,13 +946,26 @@ simde_vshlq_u32 (const simde_uint32x4_t a, const simde_int32x4_t b) { _mm_srlv_epi32(a_.m128i, _mm_abs_epi32(B)), _mm_cmpgt_epi32(_mm_setzero_si128(), B)); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); - r_.values[i] = (simde_math_abs(b_.values[i]) >= 32) ? 0 : - (b_.values[i] >= 0) ? 
(a_.values[i] << b_.values[i]) : - (a_.values[i] >> -b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf4_t b_8mf4 = __riscv_vncvt_x_x_w_i8mf4 ( + __riscv_vncvt_x_x_w_i16mf2 (b_.sv128, 4), 8); + vint8mf4_t b_8mf4_abs = __riscv_vmax_vv_i8mf4 (b_8mf4, __riscv_vneg_v_i8mf4 (b_8mf4, 16), 16); + vuint8mf4_t u_b_8mf4_abs = __riscv_vreinterpret_v_i8mf4_u8mf4 (b_8mf4_abs); + vuint32m1_t u_b = __riscv_vreinterpret_v_i32m1_u32m1 (b_.sv128); + vint32m1_t b_abs = __riscv_vmax_vv_i32m1 (b_.sv128, __riscv_vneg_v_i32m1 (b_.sv128, 4), 4); + vuint32m1_t u_b_abs = __riscv_vreinterpret_v_i32m1_u32m1 (b_abs); + r_.sv128 = __riscv_vmerge_vxm_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vsrl_vv_u32m1(a_.sv128, u_b_abs, 4) + , __riscv_vsll_vv_u32m1 (a_.sv128, u_b, 4), __riscv_vmsge_vx_i8mf4_b32(b_8mf4, 0, 4), 4), 0 + , __riscv_vmsgeu_vx_u8mf4_b32(u_b_8mf4_abs, 32, 16), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); + r_.values[i] = (simde_math_abs(b_.values[i]) >= 32) ? 0 : + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + (a_.values[i] >> -b_.values[i]); + } + #endif #endif return simde_uint32x4_from_private(r_); @@ -845,10 +1014,24 @@ simde_vshlq_u64 (const simde_uint64x2_t a, const simde_int64x2_t b) { _mm_srlv_epi64(a_.m128i, b_abs), _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_slli_epi64(b_.m128i, 56))); #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vshld_u64(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8mf8_t b_8mf8 = __riscv_vncvt_x_x_w_i8mf8 ( + __riscv_vncvt_x_x_w_i16mf4 ( + __riscv_vncvt_x_x_w_i32mf2 (b_.sv128, 2), 4), 8); + vint8mf8_t b_8mf8_abs = __riscv_vmax_vv_i8mf8 (b_8mf8, __riscv_vneg_v_i8mf8 (b_8mf8, 16), 16); + vuint8mf8_t u_b_8mf8_abs = __riscv_vreinterpret_v_i8mf8_u8mf8 (b_8mf8_abs); + vuint64m1_t u_b = __riscv_vreinterpret_v_i64m1_u64m1 (b_.sv128); + vint64m1_t b_abs = __riscv_vmax_vv_i64m1 (b_.sv128, __riscv_vneg_v_i64m1 (b_.sv128, 2), 2); + vuint64m1_t u_b_abs = __riscv_vreinterpret_v_i64m1_u64m1 (b_abs); + r_.sv128 = __riscv_vmerge_vxm_u64m1(__riscv_vmerge_vvm_u64m1(__riscv_vsrl_vv_u64m1(a_.sv128, u_b_abs, 2) + , __riscv_vsll_vv_u64m1 (a_.sv128, u_b, 2), __riscv_vmsge_vx_i8mf8_b64(b_8mf8, 0, 2), 2), 0 + , __riscv_vmsgeu_vx_u8mf8_b64(u_b_8mf8_abs, 64, 16), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vshld_u64(a_.values[i], b_.values[i]); + } + #endif #endif return simde_uint64x2_from_private(r_); diff --git a/simde/arm/neon/shl_n.h b/simde/arm/neon/shl_n.h index 61fb143a8..c80cf9f24 100644 --- a/simde/arm/neon/shl_n.h +++ b/simde/arm/neon/shl_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHL_N_H) @@ -69,8 +70,9 @@ simde_vshl_n_s8 (const simde_int8x8_t a, const int n) simde_int8x8_private r_, a_ = simde_int8x8_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_i8m1 (a_.sv64, n, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values << HEDLEY_STATIC_CAST(int8_t, n); #else SIMDE_VECTORIZE @@ -100,7 +102,9 @@ simde_vshl_n_s16 (const simde_int16x4_t a, const int n) r_, a_ = 
simde_int16x4_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_i16m1 (a_.sv64, n, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(int16_t, n); #else SIMDE_VECTORIZE @@ -129,7 +133,9 @@ simde_vshl_n_s32 (const simde_int32x2_t a, const int n) r_, a_ = simde_int32x2_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_i32m1 (a_.sv64, n, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else SIMDE_VECTORIZE @@ -158,7 +164,9 @@ simde_vshl_n_s64 (const simde_int64x1_t a, const int n) r_, a_ = simde_int64x1_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_i64m1 (a_.sv64, n, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else SIMDE_VECTORIZE @@ -187,7 +195,9 @@ simde_vshl_n_u8 (const simde_uint8x8_t a, const int n) r_, a_ = simde_uint8x8_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_u8m1 (a_.sv64, n, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values << HEDLEY_STATIC_CAST(uint8_t, n); #else SIMDE_VECTORIZE @@ -217,7 +227,9 @@ simde_vshl_n_u16 (const simde_uint16x4_t a, const int n) r_, a_ = simde_uint16x4_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_u16m1 (a_.sv64, n, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(uint16_t, n); #else SIMDE_VECTORIZE @@ -246,7 +258,9 @@ simde_vshl_n_u32 (const simde_uint32x2_t a, const int n) r_, a_ = simde_uint32x2_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_u32m1 (a_.sv64, n, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else SIMDE_VECTORIZE @@ -275,7 +289,9 @@ simde_vshl_n_u64 (const simde_uint64x1_t a, const int n) r_, a_ = simde_uint64x1_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsll_vx_u64m1 (a_.sv64, n, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else SIMDE_VECTORIZE @@ -311,6 +327,8 @@ simde_vshlq_n_s8 (const simde_int8x16_t a, const int n) r_.m128i = _mm_andnot_si128(_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, (1 << n) - 1)), _mm_slli_epi64(a_.m128i, n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_i8m1 (a_.sv128, n, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(int8_t, n); #else @@ -344,6 +362,8 @@ simde_vshlq_n_s16 (const simde_int16x8_t a, const int n) r_.m128i = _mm_slli_epi16(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_i16m1 (a_.sv128, n, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(int16_t, n); #else @@ -377,6 +397,8 @@ simde_vshlq_n_s32 (const simde_int32x4_t a, const int n) r_.m128i = _mm_slli_epi32(a_.m128i, (n)); #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_i32m1 (a_.sv128, n, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else @@ -410,6 +432,8 @@ simde_vshlq_n_s64 (const simde_int64x2_t a, const int n) r_.m128i = _mm_slli_epi64(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_i64m1 (a_.sv128, n, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else @@ -446,6 +470,8 @@ simde_vshlq_n_u8 (const simde_uint8x16_t a, const int n) r_.m128i = _mm_andnot_si128(_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, (1 << n) - 1)), _mm_slli_epi64(a_.m128i, (n))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_u8m1 (a_.sv128, n, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(uint8_t, n); #else @@ -479,6 +505,8 @@ simde_vshlq_n_u16 (const simde_uint16x8_t a, const int n) r_.m128i = _mm_slli_epi16(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_u16m1 (a_.sv128, n, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << HEDLEY_STATIC_CAST(uint16_t, n); #else @@ -512,6 +540,8 @@ simde_vshlq_n_u32 (const simde_uint32x4_t a, const int n) r_.m128i = _mm_slli_epi32(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_u32m1 (a_.sv128, n, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else @@ -545,6 +575,8 @@ simde_vshlq_n_u64 (const simde_uint64x2_t a, const int n) r_.m128i = _mm_slli_epi64(a_.m128i, (n)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_shl(a_.v128, HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsll_vx_u64m1 (a_.sv128, n, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values << n; #else diff --git a/simde/arm/neon/shll_n.h b/simde/arm/neon/shll_n.h index 898e307ed..cf6374677 100644 --- a/simde/arm/neon/shll_n.h +++ b/simde/arm/neon/shll_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHLL_N_H) @@ -49,12 +50,16 @@ simde_vshll_n_s8 (const simde_int8x8_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 8) { simde_int16x8_private r_; simde_int8x8_private a_ = simde_int8x8_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, HEDLEY_STATIC_CAST(int16_t, a_.values[i]) << n); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m2_t va_wide = __riscv_vwcvt_x_x_v_i16m2 (a_.sv64, 8); + vint16m2_t rst = __riscv_vsll_vx_i16m2 (va_wide, n, 8); + r_.sv128 = __riscv_vlmul_trunc_v_i16m2_i16m1 (rst); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, HEDLEY_STATIC_CAST(int16_t, a_.values[i]) << n); + } + #endif return 
simde_int16x8_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -71,12 +76,16 @@ simde_vshll_n_s16 (const simde_int16x4_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 16) { simde_int32x4_private r_; simde_int16x4_private a_ = simde_int16x4_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) << n; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m2_t va_wide = __riscv_vwcvt_x_x_v_i32m2 (a_.sv64, 4); + vint32m2_t rst = __riscv_vsll_vx_i32m2 (va_wide, n, 4); + r_.sv128 = __riscv_vlmul_trunc_v_i32m2_i32m1 (rst); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) << n; + } + #endif return simde_int32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -93,12 +102,16 @@ simde_vshll_n_s32 (const simde_int32x2_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 32) { simde_int64x2_private r_; simde_int32x2_private a_ = simde_int32x2_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) << n; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m2_t va_wide = __riscv_vwcvt_x_x_v_i64m2 (a_.sv64, 2); + vint64m2_t rst = __riscv_vsll_vx_i64m2 (va_wide, n, 2); + r_.sv128 = __riscv_vlmul_trunc_v_i64m2_i64m1 (rst); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) << n; + } + #endif return simde_int64x2_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -115,12 +128,16 @@ simde_vshll_n_u8 (const simde_uint8x8_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 8) { simde_uint16x8_private r_; simde_uint8x8_private a_ = simde_uint8x8_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint16_t, a_.values[i]) << n); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m2_t va_wide = __riscv_vwcvtu_x_x_v_u16m2 (a_.sv64, 8); + vuint16m2_t rst = __riscv_vsll_vx_u16m2 (va_wide, n, 8); + r_.sv128 = __riscv_vlmul_trunc_v_u16m2_u16m1 (rst); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint16_t, a_.values[i]) << n); + } + #endif return simde_uint16x8_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -137,12 +154,16 @@ simde_vshll_n_u16 (const simde_uint16x4_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 16) { simde_uint32x4_private r_; simde_uint16x4_private a_ = simde_uint16x4_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, a_.values[i]) << n; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m2_t va_wide = __riscv_vwcvtu_x_x_v_u32m2 (a_.sv64, 4); + vuint32m2_t rst = __riscv_vsll_vx_u32m2 (va_wide, n, 4); + r_.sv128 = __riscv_vlmul_trunc_v_u32m2_u32m1 (rst); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, a_.values[i]) << n; + } + #endif return simde_uint32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -159,12 +180,16 @@ 
simde_vshll_n_u32 (const simde_uint32x2_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 32) { simde_uint64x2_private r_; simde_uint32x2_private a_ = simde_uint32x2_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(uint64_t, a_.values[i]) << n; - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m2_t va_wide = __riscv_vwcvtu_x_x_v_u64m2 (a_.sv64, 2); + vuint64m2_t rst = __riscv_vsll_vx_u64m2 (va_wide, n, 2); + r_.sv128 = __riscv_vlmul_trunc_v_u64m2_u64m1 (rst); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint64_t, a_.values[i]) << n; + } + #endif return simde_uint64x2_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) diff --git a/simde/arm/neon/shr_n.h b/simde/arm/neon/shr_n.h index 10f77d786..aeb4360d5 100644 --- a/simde/arm/neon/shr_n.h +++ b/simde/arm/neon/shr_n.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Christopher Moore * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHR_N_H) @@ -100,7 +101,9 @@ simde_vshr_n_s8 (const simde_int8x8_t a, const int n) a_ = simde_int8x8_to_private(a); int32_t n_ = (n == 8) ? 7 : n; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsra_vx_i8m1 (a_.sv64, n_, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values >> n_; #else SIMDE_VECTORIZE @@ -133,7 +136,9 @@ simde_vshr_n_s16 (const simde_int16x4_t a, const int n) a_ = simde_int16x4_to_private(a); int32_t n_ = (n == 16) ? 15 : n; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsra_vx_i16m1 (a_.sv64, n_, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values >> n_; #else SIMDE_VECTORIZE @@ -163,7 +168,9 @@ simde_vshr_n_s32 (const simde_int32x2_t a, const int n) a_ = simde_int32x2_to_private(a); int32_t n_ = (n == 32) ? 31 : n; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsra_vx_i32m1 (a_.sv64, n_, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n_; #else SIMDE_VECTORIZE @@ -193,7 +200,9 @@ simde_vshr_n_s64 (const simde_int64x1_t a, const int n) a_ = simde_int64x1_to_private(a); int32_t n_ = (n == 64) ? 
63 : n; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsra_vx_i64m1 (a_.sv64, n_, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n_; #else SIMDE_VECTORIZE @@ -223,7 +232,9 @@ simde_vshr_n_u8 (const simde_uint8x8_t a, const int n) if (n == 8) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsrl_vx_u8m1 (a_.sv64, n, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -257,7 +268,9 @@ simde_vshr_n_u16 (const simde_uint16x4_t a, const int n) if (n == 16) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsrl_vx_u16m1 (a_.sv64, n, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -290,7 +303,9 @@ simde_vshr_n_u32 (const simde_uint32x2_t a, const int n) if (n == 32) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsrl_vx_u32m1 (a_.sv64, n, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -323,7 +338,9 @@ simde_vshr_n_u64 (const simde_uint64x1_t a, const int n) if (n == 64) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vsrl_vx_u64m1 (a_.sv64, n, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -369,6 +386,9 @@ simde_vshrq_n_s8 (const simde_int8x16_t a, const int n) _mm_and_si128(_mm_set1_epi16(0x00FF), _mm_srai_epi16(_mm_slli_epi16(a_.m128i, 8), 8 + (n)))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_shr(a_.v128, ((n) == 8) ? 7 : HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + int32_t n_ = (n == 8) ? 7 : n; + r_.sv128 = __riscv_vsra_vx_i8m1 (a_.sv128, n_, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> ((n == 8) ? 7 : n); #else @@ -402,6 +422,9 @@ simde_vshrq_n_s16 (const simde_int16x8_t a, const int n) r_.m128i = _mm_srai_epi16(a_.m128i, n); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_shr(a_.v128, ((n) == 16) ? 15 : HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + int32_t n_ = (n == 16) ? 15 : n; + r_.sv128 = __riscv_vsra_vx_i16m1 (a_.sv128, n_, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> ((n == 16) ? 15 : n); #else @@ -435,6 +458,9 @@ simde_vshrq_n_s32 (const simde_int32x4_t a, const int n) r_.m128i = _mm_srai_epi32(a_.m128i, n); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_shr(a_.v128, ((n) == 32) ? 31 : HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + int32_t n_ = (n == 32) ? 31 : n; + r_.sv128 = __riscv_vsra_vx_i32m1 (a_.sv128, n_, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> ((n == 32) ? 31 : n); #else @@ -467,6 +493,9 @@ simde_vshrq_n_s64 (const simde_int64x2_t a, const int n) #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_shr(a_.v128, ((n) == 64) ? 63 : HEDLEY_STATIC_CAST(uint32_t, n)); + #elif defined(SIMDE_RISCV_V_NATIVE) + int32_t n_ = (n == 64) ? 
63 : n; + r_.sv128 = __riscv_vsra_vx_i64m1 (a_.sv128, n_, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> ((n == 64) ? 63 : n); #else @@ -508,7 +537,9 @@ simde_vshrq_n_u8 (const simde_uint8x16_t a, const int n) if (n == 8) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsrl_vx_u8m1 (a_.sv128, n, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -548,7 +579,9 @@ simde_vshrq_n_u16 (const simde_uint16x8_t a, const int n) if (n == 16) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsrl_vx_u16m1 (a_.sv128, n, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -588,7 +621,9 @@ simde_vshrq_n_u32 (const simde_uint32x4_t a, const int n) if (n == 32) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsrl_vx_u32m1 (a_.sv128, n, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.values = a_.values >> n; #else SIMDE_VECTORIZE @@ -628,7 +663,9 @@ simde_vshrq_n_u64 (const simde_uint64x2_t a, const int n) if (n == 64) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vsrl_vx_u64m1 (a_.sv128, n, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) r_.values = a_.values >> n; #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/shrn_n.h b/simde/arm/neon/shrn_n.h index bba58a8ec..5c14a73b0 100644 --- a/simde/arm/neon/shrn_n.h +++ b/simde/arm/neon/shrn_n.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SHRN_N_H) @@ -44,10 +45,16 @@ simde_vshrn_n_s16 (const simde_int16x8_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { simde_int8x8_private r_; simde_int16x8_private a_ = simde_int16x8_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (a_.values[i] >> n) & UINT8_MAX); - } + + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1_t shift = __riscv_vand_vx_i16m1(__riscv_vsra_vx_i16m1 (a_.sv128, n, 8), UINT8_MAX, 8); + r_.sv64 = __riscv_vlmul_ext_v_i8mf2_i8m1(__riscv_vncvt_x_x_w_i8mf2(shift, 8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (a_.values[i] >> n) & UINT8_MAX); + } + #endif return simde_int8x8_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -66,12 +73,15 @@ simde_vshrn_n_s32 (const simde_int32x4_t a, const int n) SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { simde_int16x4_private r_; simde_int32x4_private a_ = simde_int32x4_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (a_.values[i] >> n) & UINT16_MAX); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1_t shift = __riscv_vand_vx_i32m1(__riscv_vsra_vx_i32m1 (a_.sv128, n, 4), UINT16_MAX, 4); + r_.sv64 =
__riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vncvt_x_x_w_i16mf2(shift, 4)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (a_.values[i] >> n) & UINT16_MAX); + } + #endif return simde_int16x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -91,11 +101,15 @@ simde_vshrn_n_s64 (const simde_int64x2_t a, const int n) simde_int32x2_private r_; simde_int64x2_private a_ = simde_int64x2_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (a_.values[i] >> n) & UINT32_MAX); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1_t shift = __riscv_vand_vx_i64m1(__riscv_vsra_vx_i64m1 (a_.sv128, n, 2), UINT32_MAX, 2); + r_.sv64 = __riscv_vlmul_ext_v_i32mf2_i32m1(__riscv_vncvt_x_x_w_i32mf2(shift, 2)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (a_.values[i] >> n) & UINT32_MAX); + } + #endif return simde_int32x2_from_private(r_); } #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) diff --git a/simde/arm/neon/sqadd.h b/simde/arm/neon/sqadd.h index 7e39cf10f..9afd89fff 100644 --- a/simde/arm/neon/sqadd.h +++ b/simde/arm/neon/sqadd.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SQADD_H) @@ -142,12 +143,20 @@ simde_vsqadd_u8(simde_uint8x8_t a, simde_int8x8_t b) { r_, a_ = simde_uint8x8_to_private(a); simde_int8x8_private b_ = simde_int8x8_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddb_u8(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m2_t sum = __riscv_vreinterpret_v_u16m2_i16m2( + __riscv_vadd_vv_u16m2 ( + __riscv_vwcvtu_x_x_v_u16m2 (a_.sv64, 8), __riscv_vreinterpret_v_i16m2_u16m2( \ + __riscv_vwcvt_x_x_v_i16m2 (b_.sv64, 8)),8)); + r_.sv64 = __riscv_vmerge_vxm_u8m1(__riscv_vmerge_vxm_u8m1(__riscv_vncvt_x_x_w_u8m1 \ + (__riscv_vreinterpret_v_i16m2_u16m2(sum), 8),255, __riscv_vmsgt_vx_i16m2_b8(sum, 255, 8), + 8), 0, __riscv_vmslt_vx_i16m2_b8(sum, 0, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddb_u8(a_.values[i], b_.values[i]); + } + #endif return simde_uint8x8_from_private(r_); #endif } @@ -166,12 +175,19 @@ simde_vsqadd_u16(simde_uint16x4_t a, simde_int16x4_t b) { r_, a_ = simde_uint16x4_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddh_u16(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m2_t sum = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vadd_vv_u32m2 \ + (__riscv_vwcvtu_x_x_v_u32m2 (a_.sv64, 4), __riscv_vreinterpret_v_i32m2_u32m2( \ + __riscv_vwcvt_x_x_v_i32m2 (b_.sv64, 4)), 4)); + r_.sv64 = __riscv_vmerge_vxm_u16m1(__riscv_vmerge_vxm_u16m1(__riscv_vncvt_x_x_w_u16m1( \ + __riscv_vreinterpret_v_i32m2_u32m2(sum), 4),UINT16_MAX,__riscv_vmsgt_vx_i32m2_b16(sum, UINT16_MAX, 4), + 4), 0, __riscv_vmslt_vx_i32m2_b16(sum, 0, 4), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddh_u16(a_.values[i], b_.values[i]); + } + #endif
return simde_uint16x4_from_private(r_); #endif } @@ -190,12 +206,19 @@ simde_vsqadd_u32(simde_uint32x2_t a, simde_int32x2_t b) { r_, a_ = simde_uint32x2_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m2_t sum = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vadd_vv_u64m2 (__riscv_vwcvtu_x_x_v_u64m2 (a_.sv64, 2), \ + __riscv_vreinterpret_v_i64m2_u64m2(__riscv_vwcvt_x_x_v_i64m2 (b_.sv64, 2)), 2)); + r_.sv64 = __riscv_vmerge_vxm_u32m1( + __riscv_vmerge_vxm_u32m1(__riscv_vncvt_x_x_w_u32m1(__riscv_vreinterpret_v_i64m2_u64m2(sum), 2), + UINT32_MAX,__riscv_vmsgt_vx_i64m2_b32(sum, UINT32_MAX, 2),2), 0, __riscv_vmslt_vx_i64m2_b32(sum, 0, 2), 2); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqadds_u32(a_.values[i], b_.values[i]); - } - + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqadds_u32(a_.values[i], b_.values[i]); + } + #endif return simde_uint32x2_from_private(r_); #endif } @@ -214,12 +237,18 @@ simde_vsqadd_u64(simde_uint64x1_t a, simde_int64x1_t b) { r_, a_ = simde_uint64x1_to_private(a); simde_int64x1_private b_ = simde_int64x1_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddd_u64(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t sum = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vadd_vx_i64m1(b_.sv64, (int64_t)a_.values[0], 1)); + r_.sv64 = __riscv_vmerge_vvm_u64m1(__riscv_vmerge_vxm_u64m1(sum,UINT64_MAX,__riscv_vmsgtu_vx_u64m1_b64( \ + __riscv_vreinterpret_v_i64m1_u64m1(b_.sv64), UINT64_MAX - a_.values[0], 1), 1), __riscv_vmerge_vxm_u64m1( \ + sum, 0, __riscv_vmsgtu_vx_u64m1_b64(__riscv_vreinterpret_v_i64m1_u64m1(__riscv_vneg_v_i64m1(b_.sv64, 1)), \ + a_.values[0], 1), 1), __riscv_vmsle_vx_i64m1_b64(b_.sv64, 0, 1), 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddd_u64(a_.values[i], b_.values[i]); + } + #endif return simde_uint64x1_from_private(r_); #endif } @@ -238,12 +267,18 @@ simde_vsqaddq_u8(simde_uint8x16_t a, simde_int8x16_t b) { r_, a_ = simde_uint8x16_to_private(a); simde_int8x16_private b_ = simde_int8x16_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddb_u8(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m2_t sum = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vadd_vv_u16m2 (__riscv_vwcvtu_x_x_v_u16m2 \ + (a_.sv128, 16), __riscv_vreinterpret_v_i16m2_u16m2(__riscv_vwcvt_x_x_v_i16m2 (b_.sv128, 16)), 16)); + r_.sv128 = __riscv_vmerge_vxm_u8m1(__riscv_vmerge_vxm_u8m1(__riscv_vncvt_x_x_w_u8m1( \ + __riscv_vreinterpret_v_i16m2_u16m2(sum), 16), 255, __riscv_vmsgt_vx_i16m2_b8(sum, 255, 16), 16), 0, \ + __riscv_vmslt_vx_i16m2_b8(sum, 0, 16), 16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddb_u8(a_.values[i], b_.values[i]); + } + #endif return simde_uint8x16_from_private(r_); #endif } @@ -262,12 +297,18 @@ simde_vsqaddq_u16(simde_uint16x8_t a, simde_int16x8_t b) { r_, a_ = simde_uint16x8_to_private(a); simde_int16x8_private b_ = simde_int16x8_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { 
- r_.values[i] = simde_vsqaddh_u16(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m2_t sum = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vadd_vv_u32m2 (__riscv_vwcvtu_x_x_v_u32m2 \ + (a_.sv128, 8), __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vwcvt_x_x_v_i32m2 (b_.sv128, 8)), 8)); + r_.sv128 = __riscv_vmerge_vxm_u16m1(__riscv_vmerge_vxm_u16m1(__riscv_vncvt_x_x_w_u16m1( \ + __riscv_vreinterpret_v_i32m2_u32m2(sum), 8), UINT16_MAX, __riscv_vmsgt_vx_i32m2_b16(sum, UINT16_MAX, 8), \ + 8), 0, __riscv_vmslt_vx_i32m2_b16(sum, 0, 8), 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddh_u16(a_.values[i], b_.values[i]); + } + #endif return simde_uint16x8_from_private(r_); #endif } @@ -286,12 +327,19 @@ simde_vsqaddq_u32(simde_uint32x4_t a, simde_int32x4_t b) { r_, a_ = simde_uint32x4_to_private(a); simde_int32x4_private b_ = simde_int32x4_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqadds_u32(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m2_t sum = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vadd_vv_u64m2 ( + __riscv_vwcvtu_x_x_v_u64m2 (a_.sv128, 4), __riscv_vreinterpret_v_i64m2_u64m2( \ + __riscv_vwcvt_x_x_v_i64m2 (b_.sv128, 4)), 4)); + r_.sv128 = __riscv_vmerge_vxm_u32m1(__riscv_vmerge_vxm_u32m1( + __riscv_vncvt_x_x_w_u32m1(__riscv_vreinterpret_v_i64m2_u64m2(sum), 4), UINT32_MAX, + __riscv_vmsgt_vx_i64m2_b32(sum, UINT32_MAX, 4), 4), 0, __riscv_vmslt_vx_i64m2_b32(sum, 0, 4), 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqadds_u32(a_.values[i], b_.values[i]); + } + #endif return simde_uint32x4_from_private(r_); #endif } @@ -310,12 +358,21 @@ simde_vsqaddq_u64(simde_uint64x2_t a, simde_int64x2_t b) { r_, a_ = simde_uint64x2_to_private(a); simde_int64x2_private b_ = simde_int64x2_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqaddd_u64(a_.values[i], b_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1_t sum = __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vadd_vv_i64m1(b_.sv128, \ + __riscv_vreinterpret_v_u64m1_i64m1(a_.sv128), 2)); + r_.sv128 = __riscv_vmerge_vvm_u64m1( + __riscv_vmerge_vxm_u64m1(sum, UINT64_MAX, __riscv_vmsgtu_vv_u64m1_b64( + __riscv_vreinterpret_v_i64m1_u64m1(b_.sv128), __riscv_vsub_vv_u64m1(__riscv_vmv_v_x_u64m1(UINT64_MAX, 2), \ + a_.sv128, 2), 2), 2), __riscv_vmerge_vxm_u64m1(sum, 0, __riscv_vmsgtu_vv_u64m1_b64 \ + (__riscv_vreinterpret_v_i64m1_u64m1(__riscv_vneg_v_i64m1(b_.sv128, 2)), a_.sv128, 2), 2), \ + __riscv_vmsle_vx_i64m1_b64(b_.sv128, 0, 2), 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqaddd_u64(a_.values[i], b_.values[i]); + } + #endif return simde_uint64x2_from_private(r_); #endif } diff --git a/simde/arm/neon/sqrt.h b/simde/arm/neon/sqrt.h index 0ddbc3494..01f212b50 100644 --- a/simde/arm/neon/sqrt.h +++ b/simde/arm/neon/sqrt.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_SQRT_H) @@ -60,10 +61,14 @@ simde_vsqrt_f16(simde_float16x4_t a) { r_, a_ = simde_float16x4_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < 
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqrth_f16(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv64 = __riscv_vfsqrt_v_f16m1(a_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqrth_f16(a_.values[i]); + } + #endif return simde_float16x4_from_private(r_); #endif @@ -83,11 +88,14 @@ simde_vsqrt_f32(simde_float32x2_t a) { r_, a_ = simde_float32x2_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_math_sqrtf(a_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfsqrt_v_f32m1(a_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrtf(a_.values[i]); + } + #endif return simde_float32x2_from_private(r_); #else HEDLEY_UNREACHABLE(); @@ -108,10 +116,14 @@ simde_vsqrt_f64(simde_float64x1_t a) { r_, a_ = simde_float64x1_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_math_sqrt(a_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfsqrt_v_f64m1(a_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrt(a_.values[i]); + } + #endif return simde_float64x1_from_private(r_); #else @@ -132,11 +144,14 @@ simde_vsqrtq_f16(simde_float16x8_t a) { simde_float16x8_private r_, a_ = simde_float16x8_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vsqrth_f16(a_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + r_.sv128 = __riscv_vfsqrt_v_f16m1(a_.sv128, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqrth_f16(a_.values[i]); + } + #endif return simde_float16x8_from_private(r_); #endif } @@ -155,11 +170,14 @@ simde_vsqrtq_f32(simde_float32x4_t a) { r_, a_ = simde_float32x4_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_math_sqrtf(a_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfsqrt_v_f32m1(a_.sv128, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrtf(a_.values[i]); + } + #endif return simde_float32x4_from_private(r_); #else HEDLEY_UNREACHABLE(); @@ -180,11 +198,14 @@ simde_vsqrtq_f64(simde_float64x2_t a) { r_, a_ = simde_float64x2_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_math_sqrt(a_.values[i]); - } - + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfsqrt_v_f64m1(a_.sv128, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrt(a_.values[i]); + } + #endif return simde_float64x2_from_private(r_); #else HEDLEY_UNREACHABLE();
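Note (editorial, not part of the patch): the RVV branches above are meant to be drop-in replacements for the scalar fallbacks beside them — widen then shift left for vshll_n, arithmetic shift right then narrow for vshrn_n, and a widened add clamped to the unsigned range for vsqadd. Below is a minimal host-side sketch of that equivalence for three of the touched functions. It assumes SIMDe's headers are reachable as simde/arm/neon.h; the main() harness, the test vectors, and the expected-value expressions are illustrative choices (they simply restate the scalar fallbacks), so treat it as a reviewer's aid rather than part of SIMDe's test suite.

/* Sketch: cross-check simde_vshll_n_s8, simde_vshrn_n_s16 and simde_vsqadd_u32
 * against the scalar expressions used as their fallbacks in the hunks above. */
#include <stdio.h>
#include <stdint.h>
#include "simde/arm/neon.h" /* assumed include path; adjust to your setup */

int main(void) {
  /* vshll_n_s8: each lane is widened to 16 bits, then shifted left by 3. */
  int8_t in8[8] = { INT8_MIN, -1, 0, 1, 2, 63, 64, INT8_MAX };
  int16_t out16[8];
  simde_vst1q_s16(out16, simde_vshll_n_s8(simde_vld1_s8(in8), 3));
  for (int i = 0 ; i < 8 ; i++) {
    int16_t expected = (int16_t) (((int16_t) in8[i]) << 3);
    if (out16[i] != expected) { printf("vshll_n_s8[%d]: %d != %d\n", i, out16[i], expected); return 1; }
  }

  /* vshrn_n_s16: arithmetic shift right by 2, then keep the low 8 bits. */
  int16_t in16[8] = { INT16_MIN, -257, -1, 0, 1, 255, 256, INT16_MAX };
  int8_t out8[8];
  simde_vst1_s8(out8, simde_vshrn_n_s16(simde_vld1q_s16(in16), 2));
  for (int i = 0 ; i < 8 ; i++) {
    int8_t expected = (int8_t) ((in16[i] >> 2) & UINT8_MAX);
    if (out8[i] != expected) { printf("vshrn_n_s16[%d]: %d != %d\n", i, out8[i], expected); return 1; }
  }

  /* vsqadd_u32: unsigned a plus signed b, saturated to [0, UINT32_MAX]. */
  uint32_t ina[2] = { 10, UINT32_MAX - 1 };
  int32_t  inb[2] = { -20, 5 };
  uint32_t outu[2];
  simde_vst1_u32(outu, simde_vsqadd_u32(simde_vld1_u32(ina), simde_vld1_s32(inb)));
  for (int i = 0 ; i < 2 ; i++) {
    int64_t s = (int64_t) ina[i] + (int64_t) inb[i];
    uint32_t expected = (s < 0) ? 0 : ((s > (int64_t) UINT32_MAX) ? UINT32_MAX : (uint32_t) s);
    if (outu[i] != expected) {
      printf("vsqadd_u32[%d]: %lu != %lu\n", i, (unsigned long) outu[i], (unsigned long) expected);
      return 1;
    }
  }

  puts("ok");
  return 0;
}

Building this with a riscv64 cross toolchain that enables the V extension (e.g. -march=rv64gcv, so SIMDE_RISCV_V_NATIVE is defined) and running it under qemu exercises the new RVV branches, while a native build on any other host exercises the fallbacks; the two runs should agree lane for lane.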