feat: Add vld1[q]_lane_f64
howjmay committed Jul 30, 2024
1 parent e8dc580 commit 2a46988
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 42 deletions.
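For context, NEON's vld1[q]_lane intrinsics load a single scalar from memory into one lane of an existing vector and leave every other lane unchanged. A minimal scalar model of those semantics, for orientation only (the helper name is hypothetical, not part of this commit):

#include <stdint.h>

// Scalar model of vld1q_lane_f64: overwrite dst[lane] with *src,
// keep the other lane of dst untouched.
static inline void vld1q_lane_f64_model(const double *src, double dst[2], int lane) {
  dst[lane] = src[0];
}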
84 changes: 46 additions & 38 deletions neon2rvv.h
@@ -13084,80 +13084,80 @@ FORCE_INLINE uint64x2_t vld1q_u64(const uint64_t *ptr) { return __riscv_vle64_v_

// FORCE_INLINE float16x8_t vld1q_f16(float16_t const * ptr);

-FORCE_INLINE int8x8_t vld1_lane_s8(const int8_t *a, int8x8_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int8x8_t vld1_lane_s8(const int8_t *a, int8x8_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << lane)));
  vint8m1_t a_dup = vdup_n_s8(a[0]);
  return __riscv_vmerge_vvm_i8m1(b, a_dup, mask, 8);
}

-FORCE_INLINE int16x4_t vld1_lane_s16(const int16_t *a, int16x4_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int16x4_t vld1_lane_s16(const int16_t *a, int16x4_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
  vint16m1_t a_dup = vdup_n_s16(a[0]);
  return __riscv_vmerge_vvm_i16m1(b, a_dup, mask, 4);
}

-FORCE_INLINE int32x2_t vld1_lane_s32(const int32_t *a, int32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int32x2_t vld1_lane_s32(const int32_t *a, int32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vint32m1_t a_dup = vdup_n_s32(a[0]);
  return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 2);
}

-FORCE_INLINE float32x2_t vld1_lane_f32(const float32_t *a, float32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE float32x2_t vld1_lane_f32(const float32_t *a, float32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vfloat32m1_t a_dup = vdup_n_f32(a[0]);
  return __riscv_vmerge_vvm_f32m1(b, a_dup, mask, 2);
}

-FORCE_INLINE uint8x8_t vld1_lane_u8(const uint8_t *a, uint8x8_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint8x8_t vld1_lane_u8(const uint8_t *a, uint8x8_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u8m1_b8(vdup_n_u8((uint8_t)(1 << lane)));
  vuint8m1_t a_dup = vdup_n_u8(a[0]);
  return __riscv_vmerge_vvm_u8m1(b, a_dup, mask, 8);
}

-FORCE_INLINE uint16x4_t vld1_lane_u16(const uint16_t *a, uint16x4_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint16x4_t vld1_lane_u16(const uint16_t *a, uint16x4_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
  vuint16m1_t a_dup = vdup_n_u16(a[0]);
  return __riscv_vmerge_vvm_u16m1(b, a_dup, mask, 4);
}

-FORCE_INLINE uint32x2_t vld1_lane_u32(const uint32_t *a, uint32x2_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint32x2_t vld1_lane_u32(const uint32_t *a, uint32x2_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vuint32m1_t a_dup = vdup_n_u32(a[0]);
  return __riscv_vmerge_vvm_u32m1(b, a_dup, mask, 2);
}

-FORCE_INLINE int64x1_t vld1_lane_s64(const int64_t *a, int64x1_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int64x1_t vld1_lane_s64(const int64_t *a, int64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
  vint64m1_t a_dup = vdup_n_s64(a[0]);
  return __riscv_vmerge_vvm_i64m1(b, a_dup, mask, 1);
}

-FORCE_INLINE uint64x1_t vld1_lane_u64(const uint64_t *a, uint64x1_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint64x1_t vld1_lane_u64(const uint64_t *a, uint64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
  vuint64m1_t a_dup = vdup_n_u64(a[0]);
  return __riscv_vmerge_vvm_u64m1(b, a_dup, mask, 1);
}

-FORCE_INLINE int8x16_t vld1q_lane_s8(const int8_t *a, int8x16_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << c)));
+FORCE_INLINE int8x16_t vld1q_lane_s8(const int8_t *a, int8x16_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << lane)));
  vint8m1_t a_dup = vdupq_n_s8(a[0]);
  return __riscv_vmerge_vvm_i8m1(b, a_dup, mask, 16);
}

-FORCE_INLINE int16x8_t vld1q_lane_s16(const int16_t *a, int16x8_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int16x8_t vld1q_lane_s16(const int16_t *a, int16x8_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
  vint16m1_t a_dup = vdupq_n_s16(a[0]);
  return __riscv_vmerge_vvm_i16m1(b, a_dup, mask, 8);
}

-FORCE_INLINE int32x4_t vld1q_lane_s32(const int32_t *a, int32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int32x4_t vld1q_lane_s32(const int32_t *a, int32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vint32m1_t a_dup = vdupq_n_s32(a[0]);
  return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 4);
}

-FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vfloat32m1_t a_dup = vdupq_n_f32(a[0]);
  return __riscv_vmerge_vvm_f32m1(b, a_dup, mask, 4);
}
@@ -13170,36 +13170,44 @@ FORCE_INLINE float32x4_t vld1q_lane_f32(const float32_t *a, float32x4_t b, const

// FORCE_INLINE poly16x8_t vld1q_lane_p16(poly16_t const * ptr, poly16x8_t src, const int lane);

-// FORCE_INLINE float64x1_t vld1_lane_f64(float64_t const * ptr, float64x1_t src, const int lane);
+FORCE_INLINE float64x1_t vld1_lane_f64(float64_t const *a, float64x1_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
+  vfloat64m1_t a_dup = vdup_n_f64(a[0]);
+  return __riscv_vmerge_vvm_f64m1(b, a_dup, mask, 1);
+}

-// FORCE_INLINE float64x2_t vld1q_lane_f64(float64_t const * ptr, float64x2_t src, const int lane);
+FORCE_INLINE float64x2_t vld1q_lane_f64(float64_t const *a, float64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
+  vfloat64m1_t a_dup = vdupq_n_f64(a[0]);
+  return __riscv_vmerge_vvm_f64m1(b, a_dup, mask, 2);
+}

-FORCE_INLINE uint8x16_t vld1q_lane_u8(const uint8_t *a, uint8x16_t b, const int c) {
-  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << c)));
+FORCE_INLINE uint8x16_t vld1q_lane_u8(const uint8_t *a, uint8x16_t b, const int lane) {
+  vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8(vdup_n_u16((uint16_t)(1 << lane)));
  vuint8m1_t a_dup = vdupq_n_u8(a[0]);
  return __riscv_vmerge_vvm_u8m1(b, a_dup, mask, 16);
}

-FORCE_INLINE uint16x8_t vld1q_lane_u16(const uint16_t *a, uint16x8_t b, const int c) {
-  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint16x8_t vld1q_lane_u16(const uint16_t *a, uint16x8_t b, const int lane) {
+  vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(vdup_n_u8((uint8_t)(1 << lane)));
  vuint16m1_t a_dup = vdupq_n_u16(a[0]);
  return __riscv_vmerge_vvm_u16m1(b, a_dup, mask, 8);
}

-FORCE_INLINE uint32x4_t vld1q_lane_u32(const uint32_t *a, uint32x4_t b, const int c) {
-  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint32x4_t vld1q_lane_u32(const uint32_t *a, uint32x4_t b, const int lane) {
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(vdup_n_u8((uint8_t)(1 << lane)));
  vuint32m1_t a_dup = vdupq_n_u32(a[0]);
  return __riscv_vmerge_vvm_u32m1(b, a_dup, mask, 4);
}

-FORCE_INLINE int64x2_t vld1q_lane_s64(const int64_t *a, int64x2_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE int64x2_t vld1q_lane_s64(const int64_t *a, int64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
  vint64m1_t a_dup = vdupq_n_s64(a[0]);
  return __riscv_vmerge_vvm_i64m1(b, a_dup, mask, 2);
}

-FORCE_INLINE uint64x2_t vld1q_lane_u64(const uint64_t *a, uint64x2_t b, const int c) {
-  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << c)));
+FORCE_INLINE uint64x2_t vld1q_lane_u64(const uint64_t *a, uint64x2_t b, const int lane) {
+  vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(vdup_n_u8((uint8_t)(1 << lane)));
  vuint64m1_t a_dup = vdupq_n_u64(a[0]);
  return __riscv_vmerge_vvm_u64m1(b, a_dup, mask, 2);
}
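Implementation note: each of these helpers emulates the NEON lane load by broadcasting 1 << lane into a vector register, reinterpreting it as an RVV mask (so exactly the bit for the selected lane is set among the active mask bits), then using vmerge to take the broadcast scalar in that lane and the original vector everywhere else. A hedged usage sketch of the newly added vld1q_lane_f64, assuming an RVV toolchain with this header on the include path; vld1q_f64 is used by the new tests below, while vst1q_f64 is assumed here for printing:

#include <stdio.h>
#include "neon2rvv.h"

int main(void) {
  float64_t mem[1] = {3.5};
  float64_t init[2] = {1.0, 2.0};
  float64x2_t v = vld1q_f64(init);
  // Replace lane 1 only; lane 0 keeps its original value.
  float64x2_t r = vld1q_lane_f64(mem, v, 1);
  float64_t out[2];
  vst1q_f64(out, r);
  printf("%f %f\n", out[0], out[1]);  // expected: 1.000000 3.500000
  return 0;
}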
54 changes: 52 additions & 2 deletions tests/impl.cpp
@@ -43414,9 +43414,59 @@ result_t test_vld1_lane_p16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { ret

result_t test_vld1q_lane_p16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

-result_t test_vld1_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vld1_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  double *_a = (double *)impl.test_cases_float_pointer1;
+  double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[1];
+  float64x1_t c;
+  float64x1_t b = vld1_f64(_b);
+#define TEST_IMPL(IDX)              \
+  for (int i = 0; i < 1; i++) {     \
+    if (i != IDX) {                 \
+      _c[i] = _b[i];                \
+    } else {                        \
+      _c[i] = _a[0];                \
+    }                               \
+  }                                 \
+  c = vld1_lane_f64(_a, b, IDX);    \
+  CHECK_RESULT(validate_double(c, _c[0]))
+
+  IMM_1_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}

-result_t test_vld1q_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vld1q_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  double *_a = (double *)impl.test_cases_float_pointer1;
+  double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[2];
+  float64x2_t c;
+  float64x2_t b = vld1q_f64(_b);
+#define TEST_IMPL(IDX)              \
+  for (int i = 0; i < 2; i++) {     \
+    if (i != IDX) {                 \
+      _c[i] = _b[i];                \
+    } else {                        \
+      _c[i] = _a[0];                \
+    }                               \
+  }                                 \
+  c = vld1q_lane_f64(_a, b, IDX);   \
+  CHECK_RESULT(validate_double(c, _c[0], _c[1]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}

result_t test_vld1q_lane_u8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
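The new tests reuse the harness's immediate-iteration pattern: TEST_IMPL(IDX) builds the expected result element by element and validates one lane index, and IMM_1_ITER/IMM_2_ITER instantiate it once per legal lane (lane indices must be compile-time constants). The exact macro definitions live in the test harness; a hypothetical sketch of the two-lane expansion, shown only to illustrate the pattern:

// Assumed shape of the harness macro; the real definition may differ.
#define IMM_2_ITER \
  TEST_IMPL(0)     \
  TEST_IMPL(1)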
4 changes: 2 additions & 2 deletions tests/impl.h
@@ -2634,8 +2634,8 @@
_(vld1q_lane_u32) \
/*_(vld1q_lane_p8) */ \
/*_(vld1q_lane_p16) */ \
-/*_(vld1_lane_f64) */ \
-/*_(vld1q_lane_f64) */ \
+_(vld1_lane_f64) \
+_(vld1q_lane_f64) \
/*_(vld1q_lane_p64) */ \
_(vld1q_lane_s64) \
_(vld1q_lane_u64) \
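tests/impl.h keeps the intrinsic list as an X-macro table, so uncommenting _(vld1_lane_f64) and _(vld1q_lane_f64) is what registers the new tests with every consumer of the list. A minimal sketch of the idiom with illustrative names (not the project's exact macros):

// Each consumer defines _(name) and expands the shared list once.
#define INTRIN_LIST(_)  \
  _(vld1_lane_f64)      \
  _(vld1q_lane_f64)

#define DECLARE_TEST(name) result_t test_##name(const NEON2RVV_TEST_IMPL &impl, uint32_t iter);
INTRIN_LIST(DECLARE_TEST)
#undef DECLARE_TEST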
