Skip to content

Commit

Permalink
feat: Add veor3q_[s8|s16|s32|s64|u8|u16|u32|u64]
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Aug 2, 2024
1 parent 3bbb6d9 commit c2daf9c
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 25 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ endif

ifndef CROSS_COMPILE
processor := $(shell uname -m)
ARCH_CFLAGS = -march=armv8.4-a+simd+i8mm+dotprod
ARCH_CFLAGS = -march=armv8.4-a+simd+i8mm+dotprod+sha3
else # CROSS_COMPILE was set
CC = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++
Expand Down
32 changes: 24 additions & 8 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -10701,21 +10701,37 @@ FORCE_INLINE int32x4_t vdotq_lane_s32(int32x4_t r, int8x16_t a, int8x8_t b, cons

// FORCE_INLINE uint64x2_t vsha512su1q_u64(uint64x2_t s01_s02, uint64x2_t w14_15, uint64x2_t w9_10);

// Three-way exclusive OR over 16 unsigned 8-bit elements: returns a ^ b ^ c.
FORCE_INLINE uint8x16_t veor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  // XOR the last two operands first, then fold in the first one.
  uint8x16_t b_xor_c = __riscv_vxor_vv_u8m1(b, c, 16);
  return __riscv_vxor_vv_u8m1(a, b_xor_c, 16);
}

// Three-way exclusive OR over 8 unsigned 16-bit elements: returns a ^ b ^ c.
FORCE_INLINE uint16x8_t veor3q_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  // XOR the last two operands first, then fold in the first one.
  uint16x8_t b_xor_c = __riscv_vxor_vv_u16m1(b, c, 8);
  return __riscv_vxor_vv_u16m1(a, b_xor_c, 8);
}

// Three-way exclusive OR over 4 unsigned 32-bit elements: returns a ^ b ^ c.
FORCE_INLINE uint32x4_t veor3q_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  // XOR the last two operands first, then fold in the first one.
  uint32x4_t b_xor_c = __riscv_vxor_vv_u32m1(b, c, 4);
  return __riscv_vxor_vv_u32m1(a, b_xor_c, 4);
}

// Three-way exclusive OR over 2 unsigned 64-bit elements: returns a ^ b ^ c.
FORCE_INLINE uint64x2_t veor3q_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
  // XOR the last two operands first, then fold in the first one.
  uint64x2_t b_xor_c = __riscv_vxor_vv_u64m1(b, c, 2);
  return __riscv_vxor_vv_u64m1(a, b_xor_c, 2);
}

// Three-way exclusive OR over 16 signed 8-bit elements: returns a ^ b ^ c.
FORCE_INLINE int8x16_t veor3q_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  // XOR the last two operands first, then fold in the first one.
  int8x16_t b_xor_c = __riscv_vxor_vv_i8m1(b, c, 16);
  return __riscv_vxor_vv_i8m1(a, b_xor_c, 16);
}

// Three-way exclusive OR over 8 signed 16-bit elements: returns a ^ b ^ c.
FORCE_INLINE int16x8_t veor3q_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  // XOR the last two operands first, then fold in the first one.
  int16x8_t b_xor_c = __riscv_vxor_vv_i16m1(b, c, 8);
  return __riscv_vxor_vv_i16m1(a, b_xor_c, 8);
}

// Three-way exclusive OR over 4 signed 32-bit elements: returns a ^ b ^ c.
FORCE_INLINE int32x4_t veor3q_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  // XOR the last two operands first, then fold in the first one.
  int32x4_t b_xor_c = __riscv_vxor_vv_i32m1(b, c, 4);
  return __riscv_vxor_vv_i32m1(a, b_xor_c, 4);
}

// Three-way exclusive OR over 2 signed 64-bit elements: returns a ^ b ^ c.
FORCE_INLINE int64x2_t veor3q_s64(int64x2_t a, int64x2_t b, int64x2_t c) {
  // XOR the last two operands first, then fold in the first one.
  int64x2_t b_xor_c = __riscv_vxor_vv_i64m1(b, c, 2);
  return __riscv_vxor_vv_i64m1(a, b_xor_c, 2);
}

// FORCE_INLINE uint64x2_t vrax1q_u64(uint64x2_t a, uint64x2_t b);

Expand Down
194 changes: 186 additions & 8 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37311,21 +37311,199 @@ result_t test_vsha512su0q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { r

// Placeholder test for vsha512su1q_u64: performs no checks and always
// reports TEST_UNIMPL. Neither `impl` nor `iter` is used.
result_t test_vsha512su1q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
  return TEST_UNIMPL;
}

// Checks veor3q_u8 against a scalar reference: every one of the 16 result
// elements must equal _a[i] ^ _b[i] ^ _c[i], with the inputs taken from
// impl.test_cases_int_pointer1..3. `iter` is not used.
// Returns TEST_UNIMPL when built with clang or without ENABLE_TEST_ALL.
result_t test_veor3q_u8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
  const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
  const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2;
  const uint8_t *_c = (const uint8_t *)impl.test_cases_int_pointer3;
  // Scalar reference result.
  uint8_t _d[16];
  for (int i = 0; i < 16; i++) {
    _d[i] = _a[i] ^ _b[i] ^ _c[i];
  }

  uint8x16_t a = vld1q_u8(_a);
  uint8x16_t b = vld1q_u8(_b);
  uint8x16_t c = vld1q_u8(_c);
  uint8x16_t d = veor3q_u8(a, b, c);
  return validate_uint8(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7], _d[8], _d[9], _d[10], _d[11], _d[12],
                        _d[13], _d[14], _d[15]);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
#endif  // defined(__clang__)
}

// Checks veor3q_u16 against a scalar reference: every one of the 8 result
// elements must equal the XOR of the three matching input elements, with the
// inputs taken from impl.test_cases_int_pointer1..3. `iter` is not used.
// Returns TEST_UNIMPL when built with clang or without ENABLE_TEST_ALL.
result_t test_veor3q_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
  const uint16_t *in_a = (const uint16_t *)impl.test_cases_int_pointer1;
  const uint16_t *in_b = (const uint16_t *)impl.test_cases_int_pointer2;
  const uint16_t *in_c = (const uint16_t *)impl.test_cases_int_pointer3;

  // Scalar reference result.
  uint16_t expected[8];
  for (int lane = 0; lane < 8; lane++) {
    expected[lane] = in_a[lane] ^ in_b[lane] ^ in_c[lane];
  }

  uint16x8_t va = vld1q_u16(in_a);
  uint16x8_t vb = vld1q_u16(in_b);
  uint16x8_t vc = vld1q_u16(in_c);
  uint16x8_t actual = veor3q_u16(va, vb, vc);
  return validate_uint16(actual, expected[0], expected[1], expected[2], expected[3], expected[4], expected[5],
                         expected[6], expected[7]);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
#endif  // defined(__clang__)
}

// Checks veor3q_u32 against a scalar reference: every one of the 4 result
// elements must equal the XOR of the three matching input elements, with the
// inputs taken from impl.test_cases_int_pointer1..3. `iter` is not used.
// Returns TEST_UNIMPL when built with clang or without ENABLE_TEST_ALL.
result_t test_veor3q_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
  const uint32_t *in_a = (const uint32_t *)impl.test_cases_int_pointer1;
  const uint32_t *in_b = (const uint32_t *)impl.test_cases_int_pointer2;
  const uint32_t *in_c = (const uint32_t *)impl.test_cases_int_pointer3;

  // Scalar reference result.
  uint32_t expected[4];
  for (int lane = 0; lane < 4; lane++) {
    expected[lane] = in_a[lane] ^ in_b[lane] ^ in_c[lane];
  }

  uint32x4_t va = vld1q_u32(in_a);
  uint32x4_t vb = vld1q_u32(in_b);
  uint32x4_t vc = vld1q_u32(in_c);
  uint32x4_t actual = veor3q_u32(va, vb, vc);
  return validate_uint32(actual, expected[0], expected[1], expected[2], expected[3]);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
#endif  // defined(__clang__)
}

// Checks veor3q_u64 against a scalar reference: both result elements must
// equal the XOR of the three matching input elements, with the inputs taken
// from impl.test_cases_int_pointer1..3. `iter` is not used.
// Returns TEST_UNIMPL when built with clang or without ENABLE_TEST_ALL.
result_t test_veor3q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
  const uint64_t *in_a = (const uint64_t *)impl.test_cases_int_pointer1;
  const uint64_t *in_b = (const uint64_t *)impl.test_cases_int_pointer2;
  const uint64_t *in_c = (const uint64_t *)impl.test_cases_int_pointer3;

  // Scalar reference result.
  uint64_t expected[2];
  for (int lane = 0; lane < 2; lane++) {
    expected[lane] = in_a[lane] ^ in_b[lane] ^ in_c[lane];
  }

  uint64x2_t va = vld1q_u64(in_a);
  uint64x2_t vb = vld1q_u64(in_b);
  uint64x2_t vc = vld1q_u64(in_c);
  uint64x2_t actual = veor3q_u64(va, vb, vc);
  return validate_uint64(actual, expected[0], expected[1]);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
#endif  // defined(__clang__)
}

// Checks veor3q_s8 against a scalar reference: every one of the 16 result
// elements must equal _a[i] ^ _b[i] ^ _c[i], with the inputs taken from
// impl.test_cases_int_pointer1..3. `iter` is not used.
// Returns TEST_UNIMPL when built with clang or without ENABLE_TEST_ALL.
result_t test_veor3q_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
  const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1;
  const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2;
  const int8_t *_c = (const int8_t *)impl.test_cases_int_pointer3;
  // Scalar reference result.
  int8_t _d[16];
  for (int i = 0; i < 16; i++) {
    _d[i] = _a[i] ^ _b[i] ^ _c[i];
  }

  int8x16_t a = vld1q_s8(_a);
  int8x16_t b = vld1q_s8(_b);
  int8x16_t c = vld1q_s8(_c);
  int8x16_t d = veor3q_s8(a, b, c);
  return validate_int8(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7], _d[8], _d[9], _d[10], _d[11], _d[12],
                       _d[13], _d[14], _d[15]);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
#endif  // defined(__clang__)
}

// Checks veor3q_s16 against a scalar reference: every one of the 8 result
// elements must equal _a[i] ^ _b[i] ^ _c[i], with the inputs taken from
// impl.test_cases_int_pointer1..3. `iter` is not used.
// Returns TEST_UNIMPL when built with clang or without ENABLE_TEST_ALL.
result_t test_veor3q_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
  const int16_t *_c = (const int16_t *)impl.test_cases_int_pointer3;
  // Scalar reference result.
  int16_t _d[8];
  for (int i = 0; i < 8; i++) {
    _d[i] = _a[i] ^ _b[i] ^ _c[i];
  }

  int16x8_t a = vld1q_s16(_a);
  int16x8_t b = vld1q_s16(_b);
  int16x8_t c = vld1q_s16(_c);
  int16x8_t d = veor3q_s16(a, b, c);
  return validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
#endif  // defined(__clang__)
}

// Checks veor3q_s32 against a scalar reference: every one of the 4 result
// elements must equal _a[i] ^ _b[i] ^ _c[i], with the inputs taken from
// impl.test_cases_int_pointer1..3. `iter` is not used.
// Returns TEST_UNIMPL when built with clang or without ENABLE_TEST_ALL.
result_t test_veor3q_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
  const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
  const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
  const int32_t *_c = (const int32_t *)impl.test_cases_int_pointer3;
  // Scalar reference result.
  int32_t _d[4];
  for (int i = 0; i < 4; i++) {
    _d[i] = _a[i] ^ _b[i] ^ _c[i];
  }

  int32x4_t a = vld1q_s32(_a);
  int32x4_t b = vld1q_s32(_b);
  int32x4_t c = vld1q_s32(_c);
  int32x4_t d = veor3q_s32(a, b, c);
  return validate_int32(d, _d[0], _d[1], _d[2], _d[3]);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
#endif  // defined(__clang__)
}

// Checks veor3q_s64 against a scalar reference: both result elements must
// equal _a[i] ^ _b[i] ^ _c[i], with the inputs taken from
// impl.test_cases_int_pointer1..3. `iter` is not used.
// Returns TEST_UNIMPL when built with clang or without ENABLE_TEST_ALL.
result_t test_veor3q_s64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
  const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1;
  const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2;
  const int64_t *_c = (const int64_t *)impl.test_cases_int_pointer3;
  // Scalar reference result.
  int64_t _d[2];
  for (int i = 0; i < 2; i++) {
    _d[i] = _a[i] ^ _b[i] ^ _c[i];
  }

  int64x2_t a = vld1q_s64(_a);
  int64x2_t b = vld1q_s64(_b);
  int64x2_t c = vld1q_s64(_c);
  int64x2_t d = veor3q_s64(a, b, c);
  return validate_int64(d, _d[0], _d[1]);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
#endif  // defined(__clang__)
}

// Placeholder test for vrax1q_u64: performs no checks and always reports
// TEST_UNIMPL. Neither `impl` nor `iter` is used.
result_t test_vrax1q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
  return TEST_UNIMPL;
}

Expand Down
16 changes: 8 additions & 8 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -4069,14 +4069,14 @@
/*_(vsha512h2q_u64) */ \
/*_(vsha512su0q_u64) */ \
/*_(vsha512su1q_u64) */ \
/*_(veor3q_u8) */ \
/*_(veor3q_u16) */ \
/*_(veor3q_u32) */ \
/*_(veor3q_u64) */ \
/*_(veor3q_s8) */ \
/*_(veor3q_s16) */ \
/*_(veor3q_s32) */ \
/*_(veor3q_s64) */ \
_(veor3q_u8) \
_(veor3q_u16) \
_(veor3q_u32) \
_(veor3q_u64) \
_(veor3q_s8) \
_(veor3q_s16) \
_(veor3q_s32) \
_(veor3q_s64) \
/*_(vrax1q_u64) */ \
/*_(vxarq_u64) */ \
/*_(vbcaxq_u8) */ \
Expand Down

0 comments on commit c2daf9c

Please sign in to comment.