Skip to content

Commit

Permalink
feat: Add vmulx[q]_lane[q]_[f32|f64]
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Jul 27, 2024
1 parent d92a501 commit 3eccc39
Show file tree
Hide file tree
Showing 3 changed files with 230 additions and 30 deletions.
48 changes: 36 additions & 12 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -845,29 +845,53 @@ FORCE_INLINE float64_t vmulxd_f64(float64_t a, float64_t b) {
return mul;
}

// FORCE_INLINE float32x2_t vmulx_lane_f32(float32x2_t a, float32x2_t v, const int lane);
FORCE_INLINE float32x2_t vmulx_lane_f32(float32x2_t a, float32x2_t b, const int lane) {
vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 2);
return vmulx_f32(a, b_dup_lane);
}

// FORCE_INLINE float32x4_t vmulxq_lane_f32(float32x4_t a, float32x2_t v, const int lane);
FORCE_INLINE float32x4_t vmulxq_lane_f32(float32x4_t a, float32x2_t b, const int lane) {
vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
return vmulxq_f32(a, b_dup_lane);
}

// FORCE_INLINE float64x1_t vmulx_lane_f64(float64x1_t a, float64x1_t v, const int lane);
FORCE_INLINE float64x1_t vmulx_lane_f64(float64x1_t a, float64x1_t b, const int lane) {
vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 1);
return vmulx_f64(a, b_dup_lane);
}

// FORCE_INLINE float64x2_t vmulxq_lane_f64(float64x2_t a, float64x1_t v, const int lane);
FORCE_INLINE float64x2_t vmulxq_lane_f64(float64x2_t a, float64x1_t b, const int lane) {
vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
return vmulxq_f64(a, b_dup_lane);
}

// FORCE_INLINE float32_t vmulxs_lane_f32(float32_t a, float32x2_t v, const int lane);
// FORCE_INLINE float32_t vmulxs_lane_f32(float32_t a, float32x2_t b, const int lane);

// FORCE_INLINE float64_t vmulxd_lane_f64(float64_t a, float64x1_t v, const int lane);
// FORCE_INLINE float64_t vmulxd_lane_f64(float64_t a, float64x1_t b, const int lane);

// FORCE_INLINE float32x2_t vmulx_laneq_f32(float32x2_t a, float32x4_t v, const int lane);
FORCE_INLINE float32x2_t vmulx_laneq_f32(float32x2_t a, float32x4_t b, const int lane) {
vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
return vmulx_f32(a, b_dup_lane);
}

// FORCE_INLINE float32x4_t vmulxq_laneq_f32(float32x4_t a, float32x4_t v, const int lane);
FORCE_INLINE float32x4_t vmulxq_laneq_f32(float32x4_t a, float32x4_t b, const int lane) {
vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
return vmulxq_f32(a, b_dup_lane);
}

// FORCE_INLINE float64x1_t vmulx_laneq_f64(float64x1_t a, float64x2_t v, const int lane);
FORCE_INLINE float64x1_t vmulx_laneq_f64(float64x1_t a, float64x2_t b, const int lane) {
vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
return vmulx_f64(a, b_dup_lane);
}

// FORCE_INLINE float64x2_t vmulxq_laneq_f64(float64x2_t a, float64x2_t v, const int lane);
FORCE_INLINE float64x2_t vmulxq_laneq_f64(float64x2_t a, float64x2_t b, const int lane) {
vfloat64m1_t b_dup_lane = __riscv_vrgather_vx_f64m1(b, lane, 2);
return vmulxq_f64(a, b_dup_lane);
}

// FORCE_INLINE float32_t vmulxs_laneq_f32(float32_t a, float32x4_t v, const int lane);
// FORCE_INLINE float32_t vmulxs_laneq_f32(float32_t a, float32x4_t b, const int lane);

// FORCE_INLINE float64_t vmulxd_laneq_f64(float64_t a, float64x2_t v, const int lane);
// FORCE_INLINE float64_t vmulxd_laneq_f64(float64_t a, float64x2_t b, const int lane);

FORCE_INLINE float32x2_t vdiv_f32(float32x2_t a, float32x2_t b) { return __riscv_vfdiv_vv_f32m1(a, b, 2); }

Expand Down
196 changes: 186 additions & 10 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2872,25 +2872,203 @@ result_t test_vmulxd_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#endif // ENABLE_TEST_ALL
}

result_t test_vmulx_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vmulx_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const float *_a = (float *)impl.test_cases_float_pointer1;
const float *_b = (float *)impl.test_cases_float_pointer2;
float _c[2];
float32x2_t a = vld1_f32(_a);
float32x2_t b = vld1_f32(_b);
float32x2_t c;

#define TEST_IMPL(IDX) \
for (int i = 0; i < 2; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulx_lane_f32(a, b, IDX); \
CHECK_RESULT(validate_float(c, _c[0], _c[1]))

IMM_2_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmulxq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vmulxq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const float *_a = (float *)impl.test_cases_float_pointer1;
const float *_b = (float *)impl.test_cases_float_pointer2;
float _c[4];
float32x4_t a = vld1q_f32(_a);
float32x2_t b = vld1_f32(_b);
float32x4_t c;

result_t test_vmulx_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
#define TEST_IMPL(IDX) \
for (int i = 0; i < 4; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulxq_lane_f32(a, b, IDX); \
CHECK_RESULT(validate_float(c, _c[0], _c[1], _c[2], _c[3]))

result_t test_vmulxq_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
IMM_2_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmulx_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const double *_a = (double *)impl.test_cases_float_pointer1;
const double *_b = (double *)impl.test_cases_float_pointer2;
double _c[2];
float64x1_t a = vld1_f64(_a);
float64x1_t b = vld1_f64(_b);
float64x1_t c;

for (int i = 0; i < 1; i++) {
_c[i] = _a[i] * _b[0];
}
c = vmulx_lane_f64(a, b, 0);
return validate_double(c, _c[0]);

return TEST_SUCCESS;

#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmulxq_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const double *_a = (double *)impl.test_cases_float_pointer1;
const double *_b = (double *)impl.test_cases_float_pointer2;
double _c[4];
float64x2_t a = vld1q_f64(_a);
float64x1_t b = vld1_f64(_b);
float64x2_t c;

for (int i = 0; i < 2; i++) {
_c[i] = _a[i] * _b[0];
}
c = vmulxq_lane_f64(a, b, 0);
return validate_double(c, _c[0], _c[1]);

#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmulxs_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

result_t test_vmulxd_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

result_t test_vmulx_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vmulx_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const float *_a = (float *)impl.test_cases_float_pointer1;
const float *_b = (float *)impl.test_cases_float_pointer2;
float _c[2];
float32x2_t a = vld1_f32(_a);
float32x4_t b = vld1q_f32(_b);
float32x2_t c;

#define TEST_IMPL(IDX) \
for (int i = 0; i < 2; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulx_laneq_f32(a, b, IDX); \
CHECK_RESULT(validate_float(c, _c[0], _c[1]))

IMM_4_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmulxq_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const float *_a = (float *)impl.test_cases_float_pointer1;
const float *_b = (float *)impl.test_cases_float_pointer2;
float _c[4];
float32x4_t a = vld1q_f32(_a);
float32x4_t b = vld1q_f32(_b);
float32x4_t c;

result_t test_vmulxq_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
#define TEST_IMPL(IDX) \
for (int i = 0; i < 4; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulxq_laneq_f32(a, b, IDX); \
CHECK_RESULT(validate_float(c, _c[0], _c[1], _c[2], _c[3]))

result_t test_vmulx_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
IMM_4_ITER
#undef TEST_IMPL

result_t test_vmulxq_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmulx_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const double *_a = (double *)impl.test_cases_float_pointer1;
const double *_b = (double *)impl.test_cases_float_pointer2;
double _c[2];
float64x1_t a = vld1_f64(_a);
float64x2_t b = vld1q_f64(_b);
float64x1_t c;

#define TEST_IMPL(IDX) \
for (int i = 0; i < 1; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulx_laneq_f64(a, b, IDX); \
CHECK_RESULT(validate_double(c, _c[0]))

IMM_2_ITER
#undef TEST_IMPL

return TEST_SUCCESS;

#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmulxq_laneq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const double *_a = (double *)impl.test_cases_float_pointer1;
const double *_b = (double *)impl.test_cases_float_pointer2;
double _c[4];
float64x2_t a = vld1q_f64(_a);
float64x2_t b = vld1q_f64(_b);
float64x2_t c;

#define TEST_IMPL(IDX) \
for (int i = 0; i < 2; i++) { \
_c[i] = _a[i] * _b[IDX]; \
} \
c = vmulxq_laneq_f64(a, b, IDX); \
CHECK_RESULT(validate_double(c, _c[0], _c[1]))

IMM_2_ITER
#undef TEST_IMPL

return TEST_SUCCESS;

#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vmulxs_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

Expand Down Expand Up @@ -31126,7 +31304,6 @@ result_t test_vmul_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#undef TEST_IMPL

return TEST_SUCCESS;

#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
Expand Down Expand Up @@ -31255,7 +31432,6 @@ result_t test_vmulq_lane_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#undef TEST_IMPL

return TEST_SUCCESS;

#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
Expand Down
16 changes: 8 additions & 8 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,16 @@
_(vmulxq_f64) \
_(vmulxs_f32) \
_(vmulxd_f64) \
/*_(vmulx_lane_f32) */ \
/*_(vmulxq_lane_f32) */ \
/*_(vmulx_lane_f64) */ \
/*_(vmulxq_lane_f64) */ \
_(vmulx_lane_f32) \
_(vmulxq_lane_f32) \
_(vmulx_lane_f64) \
_(vmulxq_lane_f64) \
/*_(vmulxs_lane_f32) */ \
/*_(vmulxd_lane_f64) */ \
/*_(vmulx_laneq_f32) */ \
/*_(vmulxq_laneq_f32) */ \
/*_(vmulx_laneq_f64) */ \
/*_(vmulxq_laneq_f64) */ \
_(vmulx_laneq_f32) \
_(vmulxq_laneq_f32) \
_(vmulx_laneq_f64) \
_(vmulxq_laneq_f64) \
/*_(vmulxs_laneq_f32) */ \
/*_(vmulxd_laneq_f64) */ \
_(vdiv_f32) \
Expand Down

0 comments on commit 3eccc39

Please sign in to comment.