From 3eccc392080951358a37ee7c121c6be6a16ac083 Mon Sep 17 00:00:00 2001
From: Yang Hau
Date: Sun, 28 Jul 2024 00:54:58 +0800
Subject: [PATCH] feat: Add vmulx[q]_lane[q]_[f32|f64]

---
 neon2rvv.h     |  48 +++++++++---
 tests/impl.cpp | 191 +++++++++++++++++++++++++++++++++++++++++++++---
 tests/impl.h   |  16 ++--
 3 files changed, 225 insertions(+), 30 deletions(-)

diff --git a/neon2rvv.h b/neon2rvv.h
index 38b943ab..2b16ceda 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -845,29 +845,53 @@ FORCE_INLINE float64_t vmulxd_f64(float64_t a, float64_t b) {
   return mul;
 }
 
-// FORCE_INLINE float32x2_t vmulx_lane_f32(float32x2_t a, float32x2_t v, const int lane);
+FORCE_INLINE float32x2_t vmulx_lane_f32(float32x2_t a, float32x2_t b, const int lane) {
+  vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 2);
+  return vmulx_f32(a, b_dup_lane);
+}
 
-// FORCE_INLINE float32x4_t vmulxq_lane_f32(float32x4_t a, float32x2_t v, const int lane);
+FORCE_INLINE float32x4_t vmulxq_lane_f32(float32x4_t a, float32x2_t b, const int lane) {
+  vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
+  return vmulxq_f32(a, b_dup_lane);
+}
 
-// FORCE_INLINE float64x1_t vmulx_lane_f64(float64x1_t a, float64x1_t v, const int lane);
+FORCE_INLINE float64x1_t vmulx_lane_f64(float64x1_t a, float64x1_t b, const int lane) {
+  vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 1);
+  return vmulx_f64(a, b_dup_lane);
+}
 
-// FORCE_INLINE float64x2_t vmulxq_lane_f64(float64x2_t a, float64x1_t v, const int lane);
+FORCE_INLINE float64x2_t vmulxq_lane_f64(float64x2_t a, float64x1_t b, const int lane) {
+  vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
+  return vmulxq_f64(a, b_dup_lane);
+}
 
-// FORCE_INLINE float32_t vmulxs_lane_f32(float32_t a, float32x2_t v, const int lane);
+// FORCE_INLINE float32_t vmulxs_lane_f32(float32_t a, float32x2_t b, const int lane);
 
-// FORCE_INLINE float64_t vmulxd_lane_f64(float64_t a, float64x1_t v, const int lane);
+// FORCE_INLINE float64_t vmulxd_lane_f64(float64_t a, float64x1_t b, const int lane);
 
-// FORCE_INLINE float32x2_t vmulx_laneq_f32(float32x2_t a, float32x4_t v, const int lane);
+FORCE_INLINE float32x2_t vmulx_laneq_f32(float32x2_t a, float32x4_t b, const int lane) {
+  vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
+  return vmulx_f32(a, b_dup_lane);
+}
 
-// FORCE_INLINE float32x4_t vmulxq_laneq_f32(float32x4_t a, float32x4_t v, const int lane);
+FORCE_INLINE float32x4_t vmulxq_laneq_f32(float32x4_t a, float32x4_t b, const int lane) {
+  vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
+  return vmulxq_f32(a, b_dup_lane);
+}
 
-// FORCE_INLINE float64x1_t vmulx_laneq_f64(float64x1_t a, float64x2_t v, const int lane);
+FORCE_INLINE float64x1_t vmulx_laneq_f64(float64x1_t a, float64x2_t b, const int lane) {
+  vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
+  return vmulx_f64(a, b_dup_lane);
+}
 
-// FORCE_INLINE float64x2_t vmulxq_laneq_f64(float64x2_t a, float64x2_t v, const int lane);
+FORCE_INLINE float64x2_t vmulxq_laneq_f64(float64x2_t a, float64x2_t b, const int lane) {
+  vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
+  return vmulxq_f64(a, b_dup_lane);
+}
 
-// FORCE_INLINE float32_t vmulxs_laneq_f32(float32_t a, float32x4_t v, const int lane);
+// FORCE_INLINE float32_t vmulxs_laneq_f32(float32_t a, float32x4_t b, const int lane);
 
-// FORCE_INLINE float64_t vmulxd_laneq_f64(float64_t a, float64x2_t v, const int lane);
+// FORCE_INLINE float64_t vmulxd_laneq_f64(float64_t a, float64x2_t b, const int lane);
 
 FORCE_INLINE float32x2_t vdiv_f32(float32x2_t a, float32x2_t b) { return __riscv_vfdiv_vv_f32m1(a, b, 2); }
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 46ceed96..bcb530eb 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -2872,25 +2872,198 @@ result_t test_vmulxd_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #endif  // ENABLE_TEST_ALL
 }
 
-result_t test_vmulx_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmulx_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (float *)impl.test_cases_float_pointer1;
+  const float *_b = (float *)impl.test_cases_float_pointer2;
+  float _c[2];
+  float32x2_t a = vld1_f32(_a);
+  float32x2_t b = vld1_f32(_b);
+  float32x2_t c;
+
+#define TEST_IMPL(IDX)           \
+  for (int i = 0; i < 2; i++) {  \
+    _c[i] = _a[i] * _b[IDX];     \
+  }                              \
+  c = vmulx_lane_f32(a, b, IDX); \
+  CHECK_RESULT(validate_float(c, _c[0], _c[1]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
-result_t test_vmulxq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmulxq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (float *)impl.test_cases_float_pointer1;
+  const float *_b = (float *)impl.test_cases_float_pointer2;
+  float _c[4];
+  float32x4_t a = vld1q_f32(_a);
+  float32x2_t b = vld1_f32(_b);
+  float32x4_t c;
 
-result_t test_vmulx_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 4; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulxq_lane_f32(a, b, IDX); \
+  CHECK_RESULT(validate_float(c, _c[0], _c[1], _c[2], _c[3]))
 
-result_t test_vmulxq_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulx_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (double *)impl.test_cases_float_pointer1;
+  const double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[1];
+  float64x1_t a = vld1_f64(_a);
+  float64x1_t b = vld1_f64(_b);
+  float64x1_t c;
+
+  for (int i = 0; i < 1; i++) {
+    _c[i] = _a[i] * _b[0];
+  }
+  c = vmulx_lane_f64(a, b, 0);
+  return validate_double(c, _c[0]);
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulxq_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (double *)impl.test_cases_float_pointer1;
+  const double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[2];
+  float64x2_t a = vld1q_f64(_a);
+  float64x1_t b = vld1_f64(_b);
+  float64x2_t c;
+
+  for (int i = 0; i < 2; i++) {
+    _c[i] = _a[i] * _b[0];
+  }
+  c = vmulxq_lane_f64(a, b, 0);
+  return validate_double(c, _c[0], _c[1]);
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vmulxs_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
 result_t test_vmulxd_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
-result_t test_vmulx_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmulx_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (float *)impl.test_cases_float_pointer1;
+  const float *_b = (float *)impl.test_cases_float_pointer2;
+  float _c[2];
+  float32x2_t a = vld1_f32(_a);
+  float32x4_t b = vld1q_f32(_b);
+  float32x2_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 2; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulx_laneq_f32(a, b, IDX); \
+  CHECK_RESULT(validate_float(c, _c[0], _c[1]))
+
+  IMM_4_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulxq_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (float *)impl.test_cases_float_pointer1;
+  const float *_b = (float *)impl.test_cases_float_pointer2;
+  float _c[4];
+  float32x4_t a = vld1q_f32(_a);
+  float32x4_t b = vld1q_f32(_b);
+  float32x4_t c;
 
-result_t test_vmulxq_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+#define TEST_IMPL(IDX)             \
+  for (int i = 0; i < 4; i++) {    \
+    _c[i] = _a[i] * _b[IDX];       \
+  }                                \
+  c = vmulxq_laneq_f32(a, b, IDX); \
+  CHECK_RESULT(validate_float(c, _c[0], _c[1], _c[2], _c[3]))
 
-result_t test_vmulx_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  IMM_4_ITER
+#undef TEST_IMPL
 
-result_t test_vmulxq_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulx_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (double *)impl.test_cases_float_pointer1;
+  const double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[1];
+  float64x1_t a = vld1_f64(_a);
+  float64x2_t b = vld1q_f64(_b);
+  float64x1_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 1; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulx_laneq_f64(a, b, IDX); \
+  CHECK_RESULT(validate_double(c, _c[0]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vmulxq_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (double *)impl.test_cases_float_pointer1;
+  const double *_b = (double *)impl.test_cases_float_pointer2;
+  double _c[2];
+  float64x2_t a = vld1q_f64(_a);
+  float64x2_t b = vld1q_f64(_b);
+  float64x2_t c;
+
+#define TEST_IMPL(IDX)             \
+  for (int i = 0; i < 2; i++) {    \
+    _c[i] = _a[i] * _b[IDX];       \
+  }                                \
+  c = vmulxq_laneq_f64(a, b, IDX); \
+  CHECK_RESULT(validate_double(c, _c[0], _c[1]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vmulxs_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
@@ -31126,7 +31299,6 @@ result_t test_vmul_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #undef TEST_IMPL
 
   return TEST_SUCCESS;
-
 #else
   return TEST_UNIMPL;
 #endif  // ENABLE_TEST_ALL
@@ -31255,7 +31427,6 @@ result_t test_vmulq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #undef TEST_IMPL
 
   return TEST_SUCCESS;
-
 #else
   return TEST_UNIMPL;
 #endif  // ENABLE_TEST_ALL
diff --git a/tests/impl.h b/tests/impl.h
index 4af0ac2d..0eff143b 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -171,16 +171,16 @@
   _(vmulxq_f64)             \
   _(vmulxs_f32)             \
   _(vmulxd_f64)             \
-  /*_(vmulx_lane_f32) */    \
-  /*_(vmulxq_lane_f32) */   \
-  /*_(vmulx_lane_f64) */    \
-  /*_(vmulxq_lane_f64) */   \
+  _(vmulx_lane_f32)         \
+  _(vmulxq_lane_f32)        \
+  _(vmulx_lane_f64)         \
+  _(vmulxq_lane_f64)        \
   /*_(vmulxs_lane_f32) */   \
   /*_(vmulxd_lane_f64) */   \
-  /*_(vmulx_laneq_f32) */   \
-  /*_(vmulxq_laneq_f32) */  \
-  /*_(vmulx_laneq_f64) */   \
-  /*_(vmulxq_laneq_f64) */  \
+  _(vmulx_laneq_f32)        \
+  _(vmulxq_laneq_f32)       \
+  _(vmulx_laneq_f64)        \
+  _(vmulxq_laneq_f64)       \
   /*_(vmulxs_laneq_f32) */  \
   /*_(vmulxd_laneq_f64) */  \
   _(vdiv_f32)               \
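
A note on the reference computation in the tests above: vmulx corresponds to
AArch64 FMULX, which behaves like an ordinary floating-point multiply except
that (+/-0) x (+/-Inf) returns +/-2.0 instead of NaN. The tests use a plain
`_a[i] * _b[IDX]` product as the expected value, which matches FMULX only for
inputs that avoid that special case. A minimal scalar sketch of the full
semantics, assuming C99 <math.h>; `fmulx_ref` is a hypothetical helper for
illustration, not part of the patch:

#include <math.h>

/* Hypothetical scalar reference for FMULX semantics: an ordinary product,
 * except that (+/-0) * (+/-Inf) yields 2.0 with the XOR of the operand
 * signs instead of NaN. */
static double fmulx_ref(double a, double b) {
  if ((a == 0.0 && isinf(b)) || (isinf(a) && b == 0.0)) {
    return (signbit(a) != signbit(b)) ? -2.0 : 2.0;
  }
  return a * b;
}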