Skip to content

Commit

Permalink
arm: improve performance in vhadd_xxx for risc-v
Browse files Browse the repository at this point in the history
Signed-off-by: Zhijin Zeng <[email protected]>
  • Loading branch information
zengdage authored and mr-c committed Apr 25, 2024
1 parent 7429dff commit a68fa90
Showing 1 changed file with 60 additions and 0 deletions.
60 changes: 60 additions & 0 deletions simde/arm/neon/hadd.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ simde_int8x8_t
simde_vhadd_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_s8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int8x8_private
r_,
a_ = simde_int8x8_to_private(a),
b_ = simde_int8x8_to_private(b);

r_.sv64 = __riscv_vaadd_vv_i8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8);
return simde_int8x8_from_private(r_);
#else
return simde_vmovn_s16(simde_vshrq_n_s16(simde_vaddl_s8(a, b), 1));
#endif
Expand All @@ -60,6 +68,14 @@ simde_int16x4_t
simde_vhadd_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_s16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int16x4_private
r_,
a_ = simde_int16x4_to_private(a),
b_ = simde_int16x4_to_private(b);

r_.sv64 = __riscv_vaadd_vv_i16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4);
return simde_int16x4_from_private(r_);
#else
return simde_vmovn_s32(simde_vshrq_n_s32(simde_vaddl_s16(a, b), 1));
#endif
Expand All @@ -74,6 +90,14 @@ simde_int32x2_t
simde_vhadd_s32(simde_int32x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_s32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_int32x2_private
r_,
a_ = simde_int32x2_to_private(a),
b_ = simde_int32x2_to_private(b);

r_.sv64 = __riscv_vaadd_vv_i32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2);
return simde_int32x2_from_private(r_);
#else
return simde_vmovn_s64(simde_vshrq_n_s64(simde_vaddl_s32(a, b), 1));
#endif
Expand All @@ -88,6 +112,14 @@ simde_uint8x8_t
simde_vhadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_u8(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint8x8_private
r_,
a_ = simde_uint8x8_to_private(a),
b_ = simde_uint8x8_to_private(b);

r_.sv64 = __riscv_vaaddu_vv_u8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8);
return simde_uint8x8_from_private(r_);
#else
return simde_vmovn_u16(simde_vshrq_n_u16(simde_vaddl_u8(a, b), 1));
#endif
Expand All @@ -102,6 +134,14 @@ simde_uint16x4_t
simde_vhadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_u16(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint16x4_private
r_,
a_ = simde_uint16x4_to_private(a),
b_ = simde_uint16x4_to_private(b);

r_.sv64 = __riscv_vaaddu_vv_u16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4);
return simde_uint16x4_from_private(r_);
#else
return simde_vmovn_u32(simde_vshrq_n_u32(simde_vaddl_u16(a, b), 1));
#endif
Expand All @@ -116,6 +156,14 @@ simde_uint32x2_t
simde_vhadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_u32(a, b);
#elif defined(SIMDE_RISCV_V_NATIVE)
simde_uint32x2_private
r_,
a_ = simde_uint32x2_to_private(a),
b_ = simde_uint32x2_to_private(b);

r_.sv64 = __riscv_vaaddu_vv_u32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2);
return simde_uint32x2_from_private(r_);
#else
return simde_vmovn_u64(simde_vshrq_n_u64(simde_vaddl_u32(a, b), 1));
#endif
Expand All @@ -138,6 +186,8 @@ simde_vhaddq_s8(simde_int8x16_t a, simde_int8x16_t b) {

#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE)
r_.m128i = _mm256_cvtepi16_epi8(_mm256_srai_epi16(_mm256_add_epi16(_mm256_cvtepi8_epi16(a_.m128i), _mm256_cvtepi8_epi16(b_.m128i)), 1));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vaadd_vv_i8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
Expand Down Expand Up @@ -166,6 +216,8 @@ simde_vhaddq_s16(simde_int16x8_t a, simde_int16x8_t b) {

#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm256_cvtepi32_epi16(_mm256_srai_epi32(_mm256_add_epi32(_mm256_cvtepi16_epi32(a_.m128i), _mm256_cvtepi16_epi32(b_.m128i)), 1));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vaadd_vv_i16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
Expand Down Expand Up @@ -194,6 +246,8 @@ simde_vhaddq_s32(simde_int32x4_t a, simde_int32x4_t b) {

#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm256_cvtepi64_epi32(_mm256_srai_epi64(_mm256_add_epi64(_mm256_cvtepi32_epi64(a_.m128i), _mm256_cvtepi32_epi64(b_.m128i)), 1));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vaadd_vv_i32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
Expand Down Expand Up @@ -233,6 +287,8 @@ simde_vhaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
1);
r_.v128 = wasm_i8x16_shuffle(lo, hi, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
22, 24, 26, 28, 30);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vaaddu_vv_u8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
Expand Down Expand Up @@ -261,6 +317,8 @@ simde_vhaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {

#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm256_cvtepi32_epi16(_mm256_srli_epi32(_mm256_add_epi32(_mm256_cvtepu16_epi32(a_.m128i), _mm256_cvtepu16_epi32(b_.m128i)), 1));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vaaddu_vv_u16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
Expand Down Expand Up @@ -289,6 +347,8 @@ simde_vhaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {

#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm256_cvtepi64_epi32(_mm256_srli_epi64(_mm256_add_epi64(_mm256_cvtepu32_epi64(a_.m128i), _mm256_cvtepu32_epi64(b_.m128i)), 1));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vaaddu_vv_u32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
Expand Down

0 comments on commit a68fa90

Please sign in to comment.