feat: Add vld1[q]_lane_f64
howjmay committed Jul 30, 2024
1 parent e8dc580 commit 2a46988
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 42 deletions.
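For context, NEON's vld1[q]_lane intrinsics load a single scalar from memory into one lane of an existing vector and leave every other lane unchanged. A minimal scalar model of those semantics, for orientation only (the helper name is hypothetical, not part of this commit):

#include <stdint.h>

// Scalar model of vld1q_lane_f64: overwrite dst[lane] with *src,
// keep the other lane of dst untouched.
static inline void vld1q_lane_f64_model(const double *src, double dst[2], int lane) {
  dst[lane] = src[0];
}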
84 changes: 46 additions & 38 deletions neon2rvv.h
@@ -13084,80 +13084,80 @@ FORCE_INLINE uint64x2_t vld1q_u64(const uint64_t *ptr) { return __riscv_vle64_v_

// FORCE_INLINE float16x8_t vld1q_f16(float16_t const * ptr);

-FORCE_INLINE int8x8_t vld1_lane_s8(const int8_t *a, int8x8_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int8x8_t vld1_lane_s8(const int8_t *a, int8x8_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << lane)));
  vint8m1_t a_dup = vdup_n_s8(a[0]);
  return __riscv_vmerge_vvm_i8m1(b, a_dup, mask, 8);
}

-FORCE_INLINE int16x4_t vld1_lane_s16(const int16_t *a, int16x4_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int16x4_t vld1_lane_s16(const int16_t *a, int16x4_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
  vint16m1_t a_dup = vdup_n_s16(a[0]);
  return __riscv_vmerge_vvm_i16m1(b, a_dup, mask, 4);
}

-FORCE_INLINE int32x2_t vld1_lane_s32(const int32_t *a, int32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int32x2_t vld1_lane_s32(const int32_t *a, int32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vint32m1_t a_dup = vdup_n_s32(a[0]);
  return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 2);
}

-FORCE_INLINE float32x2_t vld1_lane_f32(const float32_t *a, float32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE float32x2_t vld1_lane_f32(const float32_t *a, float32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vfloat32m1_t a_dup = vdup_n_f32(a[0]);
  return __riscv_vmerge_vvm_f32m1(b, a_dup, mask, 2);
}

-FORCE_INLINE uint8x8_t vld1_lane_u8(const uint8_t *a, uint8x8_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint8x8_t vld1_lane_u8(const uint8_t *a, uint8x8_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << lane)));
  vuint8m1_t a_dup = vdup_n_u8(a[0]);
  return __riscv_vmerge_vvm_u8m1(b, a_dup, mask, 8);
}

-FORCE_INLINE uint16x4_t vld1_lane_u16(const uint16_t *a, uint16x4_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint16x4_t vld1_lane_u16(const uint16_t *a, uint16x4_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
  vuint16m1_t a_dup = vdup_n_u16(a[0]);
  return __riscv_vmerge_vvm_u16m1(b, a_dup, mask, 4);
}

-FORCE_INLINE uint32x2_t vld1_lane_u32(const uint32_t *a, uint32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint32x2_t vld1_lane_u32(const uint32_t *a, uint32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vuint32m1_t a_dup = vdup_n_u32(a[0]);
  return __riscv_vmerge_vvm_u32m1(b, a_dup, mask, 2);
}

-FORCE_INLINE int64x1_t vld1_lane_s64(const int64_t *a, int64x1_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int64x1_t vld1_lane_s64(const int64_t *a, int64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
  vint64m1_t a_dup = vdup_n_s64(a[0]);
  return __riscv_vmerge_vvm_i64m1(b, a_dup, mask, 1);
}

-FORCE_INLINE uint64x1_t vld1_lane_u64(const uint64_t *a, uint64x1_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint64x1_t vld1_lane_u64(const uint64_t *a, uint64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
  vuint64m1_t a_dup = vdup_n_u64(a[0]);
  return __riscv_vmerge_vvm_u64m1(b, a_dup, mask, 1);
}

-FORCE_INLINE int8x16_t vld1q_lane_s8(const int8_t *a, int8x16_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << c)));
+FORCE_INLINE int8x16_t vld1q_lane_s8(const int8_t *a, int8x16_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << lane)));
  vint8m1_t a_dup = vdupq_n_s8(a[0]);
  return __riscv_vmerge_vvm_i8m1(b, a_dup, mask, 16);
}

-FORCE_INLINE int16x8_t vld1q_lane_s16(const int16_t *a, int16x8_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int16x8_t vld1q_lane_s16(const int16_t *a, int16x8_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
  vint16m1_t a_dup = vdupq_n_s16(a[0]);
  return __riscv_vmerge_vvm_i16m1(b, a_dup, mask, 8);
}

-FORCE_INLINE int32x4_t vld1q_lane_s32(const int32_t *a, int32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int32x4_t vld1q_lane_s32(const int32_t *a, int32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vint32m1_t a_dup = vdupq_n_s32(a[0]);
  return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 4);
}

-FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vfloat32m1_t a_dup = vdupq_n_f32(a[0]);
  return __riscv_vmerge_vvm_f32m1(b, a_dup, mask, 4);
}
@@ -13170,36 +13170,44 @@ FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const

// FORCE_INLINE poly16x8_t vld1q_lane_p16(poly16_t const * ptr, poly16x8_t src, const int lane);

-// FORCE_INLINE float64x1_t vld1_lane_f64(float64_t const * ptr, float64x1_t src, const int lane);
+FORCE_INLINE float64x1_t vld1_lane_f64(float64_t const *a, float64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
+  vfloat64m1_t a_dup = vdup_n_f64(a[0]);
+  return __riscv_vmerge_vvm_f64m1(b, a_dup, mask, 1);
+}

-// FORCE_INLINE float64x2_t vld1q_lane_f64(float64_t const * ptr, float64x2_t src, const int lane);
+FORCE_INLINE float64x2_t vld1q_lane_f64(float64_t const *a, float64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
+  vfloat64m1_t a_dup = vdupq_n_f64(a[0]);
+  return __riscv_vmerge_vvm_f64m1(b, a_dup, mask, 2);
+}

-FORCE_INLINE uint8x16_t vld1q_lane_u8(const uint8_t *a, uint8x16_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << c)));
+FORCE_INLINE uint8x16_t vld1q_lane_u8(const uint8_t *a, uint8x16_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << lane)));
  vuint8m1_t a_dup = vdupq_n_u8(a[0]);
  return __riscv_vmerge_vvm_u8m1(b, a_dup, mask, 16);
}

-FORCE_INLINE uint16x8_t vld1q_lane_u16(const uint16_t *a, uint16x8_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint16x8_t vld1q_lane_u16(const uint16_t *a, uint16x8_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
  vuint16m1_t a_dup = vdupq_n_u16(a[0]);
  return __riscv_vmerge_vvm_u16m1(b, a_dup, mask, 8);
}

-FORCE_INLINE uint32x4_t vld1q_lane_u32(const uint32_t *a, uint32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint32x4_t vld1q_lane_u32(const uint32_t *a, uint32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vuint32m1_t a_dup = vdupq_n_u32(a[0]);
  return __riscv_vmerge_vvm_u32m1(b, a_dup, mask, 4);
}

-FORCE_INLINE int64x2_t vld1q_lane_s64(const int64_t *a, int64x2_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int64x2_t vld1q_lane_s64(const int64_t *a, int64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
  vint64m1_t a_dup = vdupq_n_s64(a[0]);
  return __riscv_vmerge_vvm_i64m1(b, a_dup, mask, 2);
}

-FORCE_INLINE uint64x2_t vld1q_lane_u64(const uint64_t *a, uint64x2_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint64x2_t vld1q_lane_u64(const uint64_t *a, uint64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
  vuint64m1_t a_dup = vdupq_n_u64(a[0]);
  return __riscv_vmerge_vvm_u64m1(b, a_dup, mask, 2);
}
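Implementation note: each of these helpers emulates the NEON lane load by broadcasting 1 << lane into a vector register, reinterpreting it as an RVV mask (so exactly the bit for the selected lane is set among the active mask bits), then using vmerge to take the broadcast scalar in that lane and the original vector everywhere else. A hedged usage sketch of the newly added vld1q_lane_f64, assuming an RVV toolchain with this header on the include path; vld1q_f64 is used by the new tests below, while vst1q_f64 is assumed here for printing:

#include <stdio.h>
#include "neon2rvv.h"

int main(void) {
  float64_t mem[1] = {3.5};
  float64_t init[2] = {1.0, 2.0};
  float64x2_t v = vld1q_f64(init);
  // Replace lane 1 only; lane 0 keeps its original value.
  float64x2_t r = vld1q_lane_f64(mem, v, 1);
  float64_t out[2];
  vst1q_f64(out, r);
  printf("%f %f\n", out[0], out[1]);  // expected: 1.000000 3.500000
  return 0;
}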
54 changes: 52 additions & 2 deletions tests/impl.cpp
@@ -43414,9 +43414,59 @@ result_t test_vld1_lane_p16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { ret

result_t test_vld1q_lane_p16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

-result_t test_vld1_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vld1_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  double *_a = (double *)impl.test_cases_float_pointer1;
+  double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[1];
+  float64x1_t c;
+  float64x1_t b = vld1_f64(_b);
+#define TEST_IMPL(IDX)              \
+  for (int i = 0; i < 1; i++) {     \
+    if (i != IDX) {                 \
+      _c[i] = _b[i];                \
+    } else {                        \
+      _c[i] = _a[0];                \
+    }                               \
+  }                                 \
+  c = vld1_lane_f64(_a, b, IDX);    \
+  CHECK_RESULT(validate_double(c, _c[0]))
+
+  IMM_1_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}

-result_t test_vld1q_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vld1q_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  double *_a = (double *)impl.test_cases_float_pointer1;
+  double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[2];
+  float64x2_t c;
+  float64x2_t b = vld1q_f64(_b);
+#define TEST_IMPL(IDX)              \
+  for (int i = 0; i < 2; i++) {     \
+    if (i != IDX) {                 \
+      _c[i] = _b[i];                \
+    } else {                        \
+      _c[i] = _a[0];                \
+    }                               \
+  }                                 \
+  c = vld1q_lane_f64(_a, b, IDX);   \
+  CHECK_RESULT(validate_double(c, _c[0], _c[1]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}

result_t test_vld1q_lane_u8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
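The new tests reuse the harness's immediate-iteration pattern: TEST_IMPL(IDX) builds the expected result element by element and validates one lane index, and IMM_1_ITER/IMM_2_ITER instantiate it once per legal lane (lane indices must be compile-time constants). The exact macro definitions live in the test harness; a hypothetical sketch of the two-lane expansion, shown only to illustrate the pattern:

// Assumed shape of the harness macro; the real definition may differ.
#define IMM_2_ITER \
  TEST_IMPL(0)     \
  TEST_IMPL(1)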
4 changes: 2 additions & 2 deletions tests/impl.h
@@ -2634,8 +2634,8 @@
_(vld1q_lane_u32) \
/*_(vld1q_lane_p8) */ \
/*_(vld1q_lane_p16) */ \
-/*_(vld1_lane_f64) */ \
-/*_(vld1q_lane_f64) */ \
+_(vld1_lane_f64) \
+_(vld1q_lane_f64) \
/*_(vld1q_lane_p64) */ \
_(vld1q_lane_s64) \
_(vld1q_lane_u64) \
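tests/impl.h keeps the intrinsic list as an X-macro table, so uncommenting _(vld1_lane_f64) and _(vld1q_lane_f64) is what registers the new tests with every consumer of the list. A minimal sketch of the idiom with illustrative names (not the project's exact macros):

// Each consumer defines _(name) and expands the shared list once.
#define INTRIN_LIST(_)  \
  _(vld1_lane_f64)      \
  _(vld1q_lane_f64)

#define DECLARE_TEST(name) result_t test_##name(const NEON2RVV_TEST_IMPL &impl, uint32_t iter);
INTRIN_LIST(DECLARE_TEST)
#undef DECLARE_TEST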
