diff --git a/neon2rvv.h b/neon2rvv.h
index 8374ab53..48a481d5 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -13084,80 +13084,80 @@ FORCE_INLINE uint64x2_t vld1q_u64(const uint64_t *ptr) { return __riscv_vle64_v_
 
 // FORCE_INLINE float16x8_t vld1q_f16(float16_t const * ptr);
 
-FORCE_INLINE int8x8_t vld1_lane_s8(const int8_t *a, int8x8_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int8x8_t vld1_lane_s8(const int8_t *a, int8x8_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << lane)));
   vint8m1_t a_dup = vdup_n_s8(a[0]);
   return __riscv_vmerge_vvm_i8m1(b, a_dup, mask, 8);
 }
 
-FORCE_INLINE int16x4_t vld1_lane_s16(const int16_t *a, int16x4_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int16x4_t vld1_lane_s16(const int16_t *a, int16x4_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
   vint16m1_t a_dup = vdup_n_s16(a[0]);
   return __riscv_vmerge_vvm_i16m1(b, a_dup, mask, 4);
 }
 
-FORCE_INLINE int32x2_t vld1_lane_s32(const int32_t *a, int32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int32x2_t vld1_lane_s32(const int32_t *a, int32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
   vint32m1_t a_dup = vdup_n_s32(a[0]);
   return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 2);
 }
 
-FORCE_INLINE float32x2_t vld1_lane_f32(const float32_t *a, float32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE float32x2_t vld1_lane_f32(const float32_t *a, float32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
   vfloat32m1_t a_dup = vdup_n_f32(a[0]);
   return __riscv_vmerge_vvm_f32m1(b, a_dup, mask, 2);
 }
 
-FORCE_INLINE uint8x8_t vld1_lane_u8(const uint8_t *a, uint8x8_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint8x8_t vld1_lane_u8(const uint8_t *a, uint8x8_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << lane)));
   vuint8m1_t a_dup = vdup_n_u8(a[0]);
   return __riscv_vmerge_vvm_u8m1(b, a_dup, mask, 8);
 }
 
-FORCE_INLINE uint16x4_t vld1_lane_u16(const uint16_t *a, uint16x4_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint16x4_t vld1_lane_u16(const uint16_t *a, uint16x4_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
   vuint16m1_t a_dup = vdup_n_u16(a[0]);
   return __riscv_vmerge_vvm_u16m1(b, a_dup, mask, 4);
 }
 
-FORCE_INLINE uint32x2_t vld1_lane_u32(const uint32_t *a, uint32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint32x2_t vld1_lane_u32(const uint32_t *a, uint32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
   vuint32m1_t a_dup = vdup_n_u32(a[0]);
   return __riscv_vmerge_vvm_u32m1(b, a_dup, mask, 2);
 }
 
-FORCE_INLINE int64x1_t vld1_lane_s64(const int64_t *a, int64x1_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int64x1_t vld1_lane_s64(const int64_t *a, int64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
   vint64m1_t a_dup = vdup_n_s64(a[0]);
   return __riscv_vmerge_vvm_i64m1(b, a_dup, mask, 1);
 }
 
-FORCE_INLINE uint64x1_t vld1_lane_u64(const uint64_t *a, uint64x1_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint64x1_t vld1_lane_u64(const uint64_t *a, uint64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
   vuint64m1_t a_dup = vdup_n_u64(a[0]);
   return __riscv_vmerge_vvm_u64m1(b, a_dup, mask, 1);
 }
 
-FORCE_INLINE int8x16_t vld1q_lane_s8(const int8_t *a, int8x16_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << c)));
+FORCE_INLINE int8x16_t vld1q_lane_s8(const int8_t *a, int8x16_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << lane)));
   vint8m1_t a_dup = vdupq_n_s8(a[0]);
   return __riscv_vmerge_vvm_i8m1(b, a_dup, mask, 16);
 }
 
-FORCE_INLINE int16x8_t vld1q_lane_s16(const int16_t *a, int16x8_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int16x8_t vld1q_lane_s16(const int16_t *a, int16x8_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
   vint16m1_t a_dup = vdupq_n_s16(a[0]);
   return __riscv_vmerge_vvm_i16m1(b, a_dup, mask, 8);
 }
 
-FORCE_INLINE int32x4_t vld1q_lane_s32(const int32_t *a, int32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int32x4_t vld1q_lane_s32(const int32_t *a, int32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
   vint32m1_t a_dup = vdupq_n_s32(a[0]);
   return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 4);
 }
 
-FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
   vfloat32m1_t a_dup = vdupq_n_f32(a[0]);
   return __riscv_vmerge_vvm_f32m1(b, a_dup, mask, 4);
 }
@@ -13170,36 +13170,44 @@ FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const
 
 // FORCE_INLINE poly16x8_t vld1q_lane_p16(poly16_t const * ptr, poly16x8_t src, const int lane);
 
-// FORCE_INLINE float64x1_t vld1_lane_f64(float64_t const * ptr, float64x1_t src, const int lane);
+FORCE_INLINE float64x1_t vld1_lane_f64(float64_t const *a, float64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
+  vfloat64m1_t a_dup = vdup_n_f64(a[0]);
+  return __riscv_vmerge_vvm_f64m1(b, a_dup, mask, 1);
+}
 
-// FORCE_INLINE float64x2_t vld1q_lane_f64(float64_t const * ptr, float64x2_t src, const int lane);
+FORCE_INLINE float64x2_t vld1q_lane_f64(float64_t const *a, float64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
+  vfloat64m1_t a_dup = vdupq_n_f64(a[0]);
+  return __riscv_vmerge_vvm_f64m1(b, a_dup, mask, 2);
+}
 
-FORCE_INLINE uint8x16_t vld1q_lane_u8(const uint8_t *a, uint8x16_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << c)));
+FORCE_INLINE uint8x16_t vld1q_lane_u8(const uint8_t *a, uint8x16_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << lane)));
   vuint8m1_t a_dup = vdupq_n_u8(a[0]);
   return __riscv_vmerge_vvm_u8m1(b, a_dup, mask, 16);
 }
 
-FORCE_INLINE uint16x8_t vld1q_lane_u16(const uint16_t *a, uint16x8_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint16x8_t vld1q_lane_u16(const uint16_t *a, uint16x8_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
   vuint16m1_t a_dup = vdupq_n_u16(a[0]);
   return __riscv_vmerge_vvm_u16m1(b, a_dup, mask, 8);
 }
 
-FORCE_INLINE uint32x4_t vld1q_lane_u32(const uint32_t *a, uint32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint32x4_t vld1q_lane_u32(const uint32_t *a, uint32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
   vuint32m1_t a_dup = vdupq_n_u32(a[0]);
   return __riscv_vmerge_vvm_u32m1(b, a_dup, mask, 4);
 }
 
-FORCE_INLINE int64x2_t vld1q_lane_s64(const int64_t *a, int64x2_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int64x2_t vld1q_lane_s64(const int64_t *a, int64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
   vint64m1_t a_dup = vdupq_n_s64(a[0]);
   return __riscv_vmerge_vvm_i64m1(b, a_dup, mask, 2);
 }
 
-FORCE_INLINE uint64x2_t vld1q_lane_u64(const uint64_t *a, uint64x2_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint64x2_t vld1q_lane_u64(const uint64_t *a, uint64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
   vuint64m1_t a_dup = vdupq_n_u64(a[0]);
   return __riscv_vmerge_vvm_u64m1(b, a_dup, mask, 2);
 }
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 26580e9f..4b4f9319 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -43414,9 +43414,59 @@ result_t test_vld1_lane_p16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { ret
 
 result_t test_vld1q_lane_p16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
-result_t test_vld1_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vld1_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  double *_a = (double *)impl.test_cases_float_pointer1;
+  double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[1];
+  float64x1_t c;
+  float64x1_t b = vld1_f64(_b);
+#define TEST_IMPL(IDX)                \
+  for (int i = 0; i < 1; i++) {       \
+    if (i != IDX) {                   \
+      _c[i] = _b[i];                  \
+    } else {                          \
+      _c[i] = _a[0];                  \
+    }                                 \
+  }                                   \
+  c = vld1_lane_f64(_a, b, IDX);      \
+  CHECK_RESULT(validate_double(c, _c[0]))
+
+  IMM_1_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
-result_t test_vld1q_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vld1q_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  double *_a = (double *)impl.test_cases_float_pointer1;
+  double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[2];
+  float64x2_t c;
+  float64x2_t b = vld1q_f64(_b);
+#define TEST_IMPL(IDX)                \
+  for (int i = 0; i < 2; i++) {       \
+    if (i != IDX) {                   \
+      _c[i] = _b[i];                  \
+    } else {                          \
+      _c[i] = _a[0];                  \
+    }                                 \
+  }                                   \
+  c = vld1q_lane_f64(_a, b, IDX);     \
+  CHECK_RESULT(validate_double(c, _c[0], _c[1]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vld1q_lane_u8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
diff --git a/tests/impl.h b/tests/impl.h
index 667d8b68..52cef98d 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -2634,8 +2634,8 @@
   _(vld1q_lane_u32)      \
   /*_(vld1q_lane_p8) */  \
   /*_(vld1q_lane_p16) */ \
-  /*_(vld1_lane_f64) */  \
-  /*_(vld1q_lane_f64) */ \
+  _(vld1_lane_f64)       \
+  _(vld1q_lane_f64)      \
   /*_(vld1q_lane_p64) */ \
   _(vld1q_lane_s64)      \
   _(vld1q_lane_u64)      \
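Note on the emulation strategy used throughout this patch: NEON's vld1[q]_lane is reproduced in RVV by splatting 1 << lane into a one-hot mask register, broadcasting the loaded scalar with vdup[q]_n_*, and merging it over the destination with __riscv_vmerge_vvm_*, so only the selected lane is replaced. Below is a minimal usage sketch of the newly implemented vld1q_lane_f64; it is illustrative only and not part of the patch, and it assumes a 128-bit-VLEN RVV toolchain with neon2rvv.h on the include path and that the vst1q_f64 store counterpart is available (vld1q_f64 is already exercised by the tests above).

  #include <stdio.h>
  #include "neon2rvv.h"

  int main(void) {
    double init[2] = {1.0, 2.0};
    double x = 42.0;
    float64x2_t v = vld1q_f64(init);    // v = {1.0, 2.0}
    v = vld1q_lane_f64(&x, v, 1);       // replace lane 1 only: v = {1.0, 42.0}
    double out[2];
    vst1q_f64(out, v);                  // assumed store counterpart to vld1q_f64
    printf("%f %f\n", out[0], out[1]);  // expected: 1.000000 42.000000
    return 0;
  }

As with the NEON intrinsic, lane is expected to be a constant expression that is a valid lane index (here 0 or 1), since the mask is built directly from 1 << lane.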