diff --git a/simde/arm/neon/hadd.h b/simde/arm/neon/hadd.h
index 53e26d716..7e72ba3f7 100644
--- a/simde/arm/neon/hadd.h
+++ b/simde/arm/neon/hadd.h
@@ -46,6 +46,14 @@ simde_int8x8_t
 simde_vhadd_s8(simde_int8x8_t a, simde_int8x8_t b) {
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     return vhadd_s8(a, b);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_int8x8_private
+      r_,
+      a_ = simde_int8x8_to_private(a),
+      b_ = simde_int8x8_to_private(b);
+
+    r_.sv64 = __riscv_vaadd_vv_i8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8);
+    return simde_int8x8_from_private(r_);
   #else
     return simde_vmovn_s16(simde_vshrq_n_s16(simde_vaddl_s8(a, b), 1));
   #endif
@@ -60,6 +68,14 @@ simde_int16x4_t
 simde_vhadd_s16(simde_int16x4_t a, simde_int16x4_t b) {
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     return vhadd_s16(a, b);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_int16x4_private
+      r_,
+      a_ = simde_int16x4_to_private(a),
+      b_ = simde_int16x4_to_private(b);
+
+    r_.sv64 = __riscv_vaadd_vv_i16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4);
+    return simde_int16x4_from_private(r_);
   #else
     return simde_vmovn_s32(simde_vshrq_n_s32(simde_vaddl_s16(a, b), 1));
   #endif
@@ -74,6 +90,14 @@ simde_int32x2_t
 simde_vhadd_s32(simde_int32x2_t a, simde_int32x2_t b) {
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     return vhadd_s32(a, b);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_int32x2_private
+      r_,
+      a_ = simde_int32x2_to_private(a),
+      b_ = simde_int32x2_to_private(b);
+
+    r_.sv64 = __riscv_vaadd_vv_i32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2);
+    return simde_int32x2_from_private(r_);
   #else
     return simde_vmovn_s64(simde_vshrq_n_s64(simde_vaddl_s32(a, b), 1));
   #endif
@@ -88,6 +112,14 @@ simde_uint8x8_t
 simde_vhadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     return vhadd_u8(a, b);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_uint8x8_private
+      r_,
+      a_ = simde_uint8x8_to_private(a),
+      b_ = simde_uint8x8_to_private(b);
+
+    r_.sv64 = __riscv_vaaddu_vv_u8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8);
+    return simde_uint8x8_from_private(r_);
   #else
     return simde_vmovn_u16(simde_vshrq_n_u16(simde_vaddl_u8(a, b), 1));
   #endif
@@ -102,6 +134,14 @@ simde_uint16x4_t
 simde_vhadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     return vhadd_u16(a, b);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_uint16x4_private
+      r_,
+      a_ = simde_uint16x4_to_private(a),
+      b_ = simde_uint16x4_to_private(b);
+
+    r_.sv64 = __riscv_vaaddu_vv_u16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4);
+    return simde_uint16x4_from_private(r_);
   #else
     return simde_vmovn_u32(simde_vshrq_n_u32(simde_vaddl_u16(a, b), 1));
   #endif
@@ -116,6 +156,14 @@ simde_uint32x2_t
 simde_vhadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
   #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
     return vhadd_u32(a, b);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_uint32x2_private
+      r_,
+      a_ = simde_uint32x2_to_private(a),
+      b_ = simde_uint32x2_to_private(b);
+
+    r_.sv64 = __riscv_vaaddu_vv_u32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2);
+    return simde_uint32x2_from_private(r_);
   #else
     return simde_vmovn_u64(simde_vshrq_n_u64(simde_vaddl_u32(a, b), 1));
   #endif
@@ -138,6 +186,8 @@ simde_vhaddq_s8(simde_int8x16_t a, simde_int8x16_t b) {
     #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE)
       r_.m128i = _mm256_cvtepi16_epi8(_mm256_srai_epi16(_mm256_add_epi16(_mm256_cvtepi8_epi16(a_.m128i), _mm256_cvtepi8_epi16(b_.m128i)), 1));
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv128 = __riscv_vaadd_vv_i8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -166,6 +216,8 @@ simde_vhaddq_s16(simde_int16x8_t a, simde_int16x8_t b) {
     #if defined(SIMDE_X86_AVX512VL_NATIVE)
       r_.m128i = _mm256_cvtepi32_epi16(_mm256_srai_epi32(_mm256_add_epi32(_mm256_cvtepi16_epi32(a_.m128i), _mm256_cvtepi16_epi32(b_.m128i)), 1));
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv128 = __riscv_vaadd_vv_i16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -194,6 +246,8 @@ simde_vhaddq_s32(simde_int32x4_t a, simde_int32x4_t b) {
     #if defined(SIMDE_X86_AVX512VL_NATIVE)
       r_.m128i = _mm256_cvtepi64_epi32(_mm256_srai_epi64(_mm256_add_epi64(_mm256_cvtepi32_epi64(a_.m128i), _mm256_cvtepi32_epi64(b_.m128i)), 1));
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv128 = __riscv_vaadd_vv_i32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -233,6 +287,8 @@ simde_vhaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
         1);
       r_.v128 = wasm_i8x16_shuffle(lo, hi, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv128 = __riscv_vaaddu_vv_u8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -261,6 +317,8 @@ simde_vhaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
     #if defined(SIMDE_X86_AVX512VL_NATIVE)
       r_.m128i = _mm256_cvtepi32_epi16(_mm256_srli_epi32(_mm256_add_epi32(_mm256_cvtepu16_epi32(a_.m128i), _mm256_cvtepu16_epi32(b_.m128i)), 1));
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv128 = __riscv_vaaddu_vv_u16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -289,6 +347,8 @@ simde_vhaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
     #if defined(SIMDE_X86_AVX512VL_NATIVE)
       r_.m128i = _mm256_cvtepi64_epi32(_mm256_srli_epi64(_mm256_add_epi64(_mm256_cvtepu32_epi64(a_.m128i), _mm256_cvtepu32_epi64(b_.m128i)), 1));
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv128 = __riscv_vaaddu_vv_u32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
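Note for reviewers (not part of the patch): NEON's vhadd family computes floor((a + b) / 2) in a widened type, and the RVV averaging adds (vaadd for signed, vaaddu for unsigned) with the fixed-point rounding mode set to round-down (__RISCV_VXRM_RDN) have the same semantics, which is why the mapping above needs no extra widening or shifts. A minimal scalar sketch of that equivalence follows; the helper names are illustrative only and are not SIMDe or RVV APIs.

/* Scalar model of the halving-add semantics the RVV mapping relies on.
 * Helper names are illustrative only; they are not SIMDe or RVV APIs. */
#include <stdint.h>
#include <stdio.h>

static int8_t hadd_s8_model(int8_t a, int8_t b) {
  /* Widen, add, arithmetic shift right by one: floor((a + b) / 2),
   * matching vhadd_s8 and vaadd with VXRM = RDN. */
  return (int8_t) (((int16_t) a + (int16_t) b) >> 1);
}

static uint8_t hadd_u8_model(uint8_t a, uint8_t b) {
  /* Unsigned counterpart, matching vhadd_u8 and vaaddu with VXRM = RDN. */
  return (uint8_t) (((uint16_t) a + (uint16_t) b) >> 1);
}

int main(void) {
  /* Spot checks, including corners where a non-widened add would overflow. */
  printf("%d\n", hadd_s8_model(127, 127));  /* 127 */
  printf("%d\n", hadd_s8_model(-128, 1));   /* -64 (rounds toward -infinity) */
  printf("%d\n", hadd_u8_model(255, 255));  /* 255 */
  return 0;
}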