arm: improve performance in vqadd and vmvn in risc-v
Signed-off-by: Zhijin Zeng <[email protected]>
zengdage authored and mr-c committed May 17, 2024
1 parent f046ab7 commit 17416b1
Showing 2 changed files with 56 additions and 0 deletions.
24 changes: 24 additions & 0 deletions simde/arm/neon/mvn.h
@@ -57,6 +57,8 @@ simde_vmvnq_s8(simde_int8x16_t a) {
r_.v128 = wasm_v128_not(a_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vnot_v_i8m1(a_.sv128, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -92,6 +94,8 @@ simde_vmvnq_s16(simde_int16x8_t a) {
r_.v128 = wasm_v128_not(a_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vnot_v_i16m1(a_.sv128, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -127,6 +131,8 @@ simde_vmvnq_s32(simde_int32x4_t a) {
r_.v128 = wasm_v128_not(a_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vnot_v_i32m1(a_.sv128, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -162,6 +168,8 @@ simde_vmvnq_u8(simde_uint8x16_t a) {
r_.v128 = wasm_v128_not(a_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vnot_v_u8m1(a_.sv128, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -197,6 +205,8 @@ simde_vmvnq_u16(simde_uint16x8_t a) {
r_.v128 = wasm_v128_not(a_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vnot_v_u16m1(a_.sv128, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -232,6 +242,8 @@ simde_vmvnq_u32(simde_uint32x4_t a) {
r_.v128 = wasm_v128_not(a_.v128);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vnot_v_u32m1(a_.sv128, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -261,6 +273,8 @@ simde_vmvn_s8(simde_int8x8_t a) {
r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi8(a_.m64, a_.m64));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vnot_v_i8m1(a_.sv64, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -290,6 +304,8 @@ simde_vmvn_s16(simde_int16x4_t a) {
r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi16(a_.m64, a_.m64));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vnot_v_i16m1(a_.sv64, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -319,6 +335,8 @@ simde_vmvn_s32(simde_int32x2_t a) {
r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi32(a_.m64, a_.m64));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vnot_v_i32m1(a_.sv64, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -348,6 +366,8 @@ simde_vmvn_u8(simde_uint8x8_t a) {
r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi8(a_.m64, a_.m64));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vnot_v_u8m1(a_.sv64, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -377,6 +397,8 @@ simde_vmvn_u16(simde_uint16x4_t a) {
r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi16(a_.m64, a_.m64));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vnot_v_u16m1(a_.sv64, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -406,6 +428,8 @@ simde_vmvn_u32(simde_uint32x2_t a) {
r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi32(a_.m64, a_.m64));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = ~a_.values;
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vnot_v_u32m1(a_.sv64, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
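Every mvn.h hunk follows the same pattern: NEON's element-wise bitwise NOT is mapped onto the RVV `vnot` intrinsic, with the final argument giving the vector length `vl`, set to the NEON lane count (16 for a 128-bit vector of int8, down to 2 for a 64-bit vector of int32). Below is a minimal standalone sketch of that mapping — not part of the patch — assuming a toolchain that ships `riscv_vector.h` and targets the `v` extension (e.g. `-march=rv64gcv`); the input values are illustrative:

```c
#include <riscv_vector.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int8_t in[16], out[16];
  for (int i = 0; i < 16; i++) in[i] = (int8_t) i;

  /* vl = 16 matches the 16 lanes of a NEON int8x16_t. */
  vint8m1_t va = __riscv_vle8_v_i8m1(in, 16);
  vint8m1_t vr = __riscv_vnot_v_i8m1(va, 16);  /* same call the patch adds */
  __riscv_vse8_v_i8m1(out, vr, 16);

  for (int i = 0; i < 16; i++) {
    /* vnot is element-wise bitwise NOT, i.e. ~in[i]. */
    if (out[i] != (int8_t) ~in[i]) { puts("mismatch"); return 1; }
  }
  puts("ok");
  return 0;
}
```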
32 changes: 32 additions & 0 deletions simde/arm/neon/qadd.h
@@ -144,6 +144,8 @@ simde_vqadd_s8(simde_int8x8_t a, simde_int8x8_t b) {

uint8_t m SIMDE_VECTOR(8) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0);
r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vsadd_vv_i8m1(a_.sv64, b_.sv64, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -181,6 +183,8 @@ simde_vqadd_s16(simde_int16x4_t a, simde_int16x4_t b) {

uint16_t m SIMDE_VECTOR(8) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0);
r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vsadd_vv_i16m1(a_.sv64, b_.sv64, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -216,6 +220,8 @@ simde_vqadd_s32(simde_int32x2_t a, simde_int32x2_t b) {

uint32_t m SIMDE_VECTOR(8) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0);
r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vsadd_vv_i32m1(a_.sv64, b_.sv64, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -251,6 +257,8 @@ simde_vqadd_s64(simde_int64x1_t a, simde_int64x1_t b) {

uint64_t m SIMDE_VECTOR(8) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0);
r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vsadd_vv_i64m1(a_.sv64, b_.sv64, 1);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -282,6 +290,8 @@ simde_vqadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#elif defined(SIMDE_VECTOR_SUBSCRIPT) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values + b_.values;
r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vsaddu_vv_u8m1(a_.sv64, b_.sv64, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -313,6 +323,8 @@ simde_vqadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#elif defined(SIMDE_VECTOR_SUBSCRIPT) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values + b_.values;
r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vsaddu_vv_u16m1(a_.sv64, b_.sv64, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -342,6 +354,8 @@ simde_vqadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_VECTOR_SUBSCRIPT) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values + b_.values;
r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vsaddu_vv_u32m1(a_.sv64, b_.sv64, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -371,6 +385,8 @@ simde_vqadd_u64(simde_uint64x1_t a, simde_uint64x1_t b) {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
r_.values = a_.values + b_.values;
r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv64 = __riscv_vsaddu_vv_u64m1(a_.sv64, b_.sv64, 1);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -412,6 +428,8 @@ simde_vqaddq_s8(simde_int8x16_t a, simde_int8x16_t b) {

uint8_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0);
r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vsadd_vv_i8m1(a_.sv128, b_.sv128, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -453,6 +471,8 @@ simde_vqaddq_s16(simde_int16x8_t a, simde_int16x8_t b) {

uint16_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0);
r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vsadd_vv_i16m1(a_.sv128, b_.sv128, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -530,6 +550,8 @@ simde_vqaddq_s32(simde_int32x4_t a, simde_int32x4_t b) {

uint32_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0);
r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vsadd_vv_i32m1(a_.sv128, b_.sv128, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -596,6 +618,8 @@ simde_vqaddq_s64(simde_int64x2_t a, simde_int64x2_t b) {

uint64_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au ^ bu) | ~(bu ^ ru)) < 0);
r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), (au & ~m) | (ru & m));
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vsadd_vv_i64m1(a_.sv128, b_.sv128, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -631,6 +655,8 @@ simde_vqaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
r_.values = a_.values + b_.values;
r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vsaddu_vv_u8m1(a_.sv128, b_.sv128, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -666,6 +692,8 @@ simde_vqaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
r_.values = a_.values + b_.values;
r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vsaddu_vv_u16m1(a_.sv128, b_.sv128, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -716,6 +744,8 @@ simde_vqaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
r_.values = a_.values + b_.values;
r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vsaddu_vv_u32m1(a_.sv128, b_.sv128, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -745,6 +775,8 @@ simde_vqaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
r_.values = a_.values + b_.values;
r_.values |= HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), r_.values < a_.values);
#elif defined(SIMDE_RISCV_V_NATIVE)
r_.sv128 = __riscv_vsaddu_vv_u64m1(a_.sv128, b_.sv128, 2);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
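The qadd.h hunks follow the same shape: NEON's saturating add becomes RVV's `vsadd` (signed) or `vsaddu` (unsigned) with `vl` equal to the lane count, so saturation is handled by the instruction instead of the overflow-masking or scalar fallback paths. A small sketch of the unsigned 8-bit case — again not part of the patch, under the same toolchain assumptions as above, with illustrative test values and a scalar reference check:

```c
#include <riscv_vector.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t a[8] = { 0, 100, 200, 250, 255, 1, 128, 127 };
  uint8_t b[8] = { 0, 100, 100,  10,   1, 1, 128, 127 };
  uint8_t r[8];

  /* vl = 8 matches the 8 lanes of a NEON uint8x8_t. */
  vuint8m1_t va = __riscv_vle8_v_u8m1(a, 8);
  vuint8m1_t vb = __riscv_vle8_v_u8m1(b, 8);
  vuint8m1_t vr = __riscv_vsaddu_vv_u8m1(va, vb, 8); /* clamps at 255 */
  __riscv_vse8_v_u8m1(r, vr, 8);

  for (int i = 0; i < 8; i++) {
    unsigned s = (unsigned) a[i] + b[i];
    uint8_t expect = (uint8_t) (s > 255 ? 255 : s);  /* scalar reference */
    if (r[i] != expect) { puts("mismatch"); return 1; }
  }
  puts("ok");
  return 0;
}
```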
