Skip to content

Commit

Permalink
[Fix] : f16 intrinsics
Browse files (browse the repository at this point in the history)
  • Loading branch information
朱季葳 authored and committed on Oct 16, 2023
1 parent 5352cfa commit 2bab3a9
Show file tree
Hide file tree
Showing 4 changed files with 226 additions and 200 deletions.
84 changes: 48 additions & 36 deletions simde/arm/neon/cmla_lane.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,18 @@ simde_float16x4_t simde_vcmla_lane_f16(simde_float16x4_t r, simde_float16x4_t a,
b_ = simde_float16x4_to_private(simde_vdup_n_f16(
simde_float16x4_to_private(b).values[lane]));

SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0])));
i++) {
r_.values[2 * i] = simde_vaddh_f16(
r_.values[2 * i], simde_vmulh_f16(b_.values[lane], a_.values[2 * i]));
r_.values[2 * i + 1] =
simde_vaddh_f16(r_.values[2 * i + 1],
simde_vmulh_f16(b_.values[lane], a_.values[2 * i]));
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2);
r_.values += b_.values * a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
r_.values[2 * i] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i]) + simde_float16_to_float32(b_.values[lane]) * simde_float16_to_float32(a_.values[2 * i]));
r_.values[2 * i + 1] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i + 1]) + simde_float16_to_float32(b_.values[lane]) * simde_float16_to_float32(a_.values[2 * i]));
}
#endif
result = simde_float16x4_from_private(r_);
return result;
#endif
Expand Down Expand Up @@ -121,15 +124,18 @@ simde_float16x4_t simde_vcmla_laneq_f16(simde_float16x4_t r,
b_ = simde_float16x4_to_private(simde_vdup_n_f16(
simde_float16x8_to_private(b).values[lane]));

SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0])));
i++) {
r_.values[2 * i] = simde_vaddh_f16(
r_.values[2 * i], simde_vmulh_f16(b_.values[lane], a_.values[2 * i]));
r_.values[2 * i + 1] =
simde_vaddh_f16(r_.values[2 * i + 1],
simde_vmulh_f16(b_.values[lane], a_.values[2 * i]));
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2);
r_.values += b_.values * a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
r_.values[2 * i] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i]) + simde_float16_to_float32(b_.values[lane]) * simde_float16_to_float32(a_.values[2 * i]));
r_.values[2 * i + 1] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i + 1]) + simde_float16_to_float32(b_.values[lane]) * simde_float16_to_float32(a_.values[2 * i]));
}
#endif
result = simde_float16x4_from_private(r_);
return result;
#endif
Expand Down Expand Up @@ -193,15 +199,18 @@ simde_float16x8_t simde_vcmlaq_lane_f16(simde_float16x8_t r,
b_ = simde_float16x8_to_private(simde_vdupq_n_f16(
simde_float16x4_to_private(b).values[lane]));

SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0])));
i++) {
r_.values[2 * i] = simde_vaddh_f16(
r_.values[2 * i], simde_vmulh_f16(b_.values[lane], a_.values[2 * i]));
r_.values[2 * i + 1] =
simde_vaddh_f16(r_.values[2 * i + 1],
simde_vmulh_f16(b_.values[lane], a_.values[2 * i]));
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.values, a_.values, 0, 0, 2, 2, 4, 4, 6, 6);
r_.values += b_.values * a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
r_.values[2 * i] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i]) + simde_float16_to_float32(b_.values[lane]) * simde_float16_to_float32(a_.values[2 * i]));
r_.values[2 * i + 1] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i + 1]) + simde_float16_to_float32(b_.values[lane]) * simde_float16_to_float32(a_.values[2 * i]));
}
#endif
result = simde_float16x8_from_private(r_);
return result;
#endif
Expand Down Expand Up @@ -263,15 +272,18 @@ simde_float16x8_t simde_vcmlaq_laneq_f16(simde_float16x8_t r,
b_ = simde_float16x8_to_private(simde_vdupq_n_f16(
simde_float16x8_to_private(b).values[lane]));

SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0])));
i++) {
r_.values[2 * i] = simde_vaddh_f16(
r_.values[2 * i], simde_vmulh_f16(b_.values[lane], a_.values[2 * i]));
r_.values[2 * i + 1] =
simde_vaddh_f16(r_.values[2 * i + 1],
simde_vmulh_f16(b_.values[lane], a_.values[2 * i]));
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.values, a_.values, 0, 0, 2, 2, 4, 4, 6, 6);
r_.values += b_.values * a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
r_.values[2 * i] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i]) + simde_float16_to_float32(b_.values[lane]) * simde_float16_to_float32(a_.values[2 * i]));
r_.values[2 * i + 1] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i + 1]) + simde_float16_to_float32(b_.values[lane]) * simde_float16_to_float32(a_.values[2 * i]));
}
#endif
result = simde_float16x8_from_private(r_);
return result;
#endif
Expand Down
112 changes: 56 additions & 56 deletions simde/arm/neon/cmla_rot180_lane.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,20 @@ simde_float16x4_t simde_vcmla_rot180_lane_f16(simde_float16x4_t r,
b_ = simde_float16x4_to_private(simde_vdup_n_f16(
simde_float16x4_to_private(b).values[lane]));

SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0])));
i++) {
r_.values[2 * i] = simde_vaddh_f16(
r_.values[2 * i],
simde_vmulh_f16(simde_float16_from_float32(
-simde_float16_to_float32(b_.values[2 * i])),
a_.values[2 * i]));
r_.values[2 * i + 1] = simde_vaddh_f16(
r_.values[2 * i + 1],
simde_vmulh_f16(simde_float16_from_float32(
-simde_float16_to_float32(b_.values[2 * i + 1])),
a_.values[2 * i]));
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && \
((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2);
b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3);
r_.values += b_.values * a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
r_.values[2 * i] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i]) + (-simde_float16_to_float32(b_.values[2 * i])) * simde_float16_to_float32(a_.values[2 * i]));
r_.values[2 * i + 1] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i + 1]) + (-simde_float16_to_float32(b_.values[2 * i + 1])) * simde_float16_to_float32(a_.values[2 * i]));
}
#endif

result = simde_float16x4_from_private(r_);
return result;
Expand Down Expand Up @@ -135,20 +135,20 @@ simde_float16x8_t simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r,
b_ = simde_float16x8_to_private(simde_vdupq_n_f16(
simde_float16x4_to_private(b).values[lane]));

SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0])));
i++) {
r_.values[2 * i] = simde_vaddh_f16(
r_.values[2 * i],
simde_vmulh_f16(simde_float16_from_float32(
-simde_float16_to_float32(b_.values[2 * i])),
a_.values[2 * i]));
r_.values[2 * i + 1] = simde_vaddh_f16(
r_.values[2 * i + 1],
simde_vmulh_f16(simde_float16_from_float32(
-simde_float16_to_float32(b_.values[2 * i + 1])),
a_.values[2 * i]));
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) \
&& ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.values, a_.values, 0, 0, 2, 2, 4, 4, 6, 6);
b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 0, 1, 2, 3, 4, 5, 6, 7);
r_.values += b_.values * a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
r_.values[2 * i] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i]) + (-simde_float16_to_float32(b_.values[2 * i])) * simde_float16_to_float32(a_.values[2 * i]));
r_.values[2 * i + 1] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i + 1]) + (-simde_float16_to_float32(b_.values[2 * i + 1])) * simde_float16_to_float32(a_.values[2 * i]));
}
#endif

result = simde_float16x8_from_private(r_);
return result;
Expand Down Expand Up @@ -216,20 +216,20 @@ simde_float16x4_t simde_vcmla_rot180_laneq_f16(simde_float16x4_t r,
b_ = simde_float16x4_to_private(simde_vdup_n_f16(
simde_float16x8_to_private(b).values[lane]));

SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0])));
i++) {
r_.values[2 * i] = simde_vaddh_f16(
r_.values[2 * i],
simde_vmulh_f16(simde_float16_from_float32(
-simde_float16_to_float32(b_.values[2 * i])),
a_.values[2 * i]));
r_.values[2 * i + 1] = simde_vaddh_f16(
r_.values[2 * i + 1],
simde_vmulh_f16(simde_float16_from_float32(
-simde_float16_to_float32(b_.values[2 * i + 1])),
a_.values[2 * i]));
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) \
&& ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2);
b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3);
r_.values += b_.values * a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
r_.values[2 * i] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i]) + (-simde_float16_to_float32(b_.values[2 * i])) * simde_float16_to_float32(a_.values[2 * i]));
r_.values[2 * i + 1] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i + 1]) + (-simde_float16_to_float32(b_.values[2 * i + 1])) * simde_float16_to_float32(a_.values[2 * i]));
}
#endif

result = simde_float16x4_from_private(r_);
return result;
Expand Down Expand Up @@ -300,20 +300,20 @@ simde_float16x8_t simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r,
b_ = simde_float16x8_to_private(simde_vdupq_n_f16(
simde_float16x8_to_private(b).values[lane]));

SIMDE_VECTORIZE
for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0])));
i++) {
r_.values[2 * i] = simde_vaddh_f16(
r_.values[2 * i],
simde_vmulh_f16(simde_float16_from_float32(
-simde_float16_to_float32(b_.values[2 * i])),
a_.values[2 * i]));
r_.values[2 * i + 1] = simde_vaddh_f16(
r_.values[2 * i + 1],
simde_vmulh_f16(simde_float16_from_float32(
-simde_float16_to_float32(b_.values[2 * i + 1])),
a_.values[2 * i]));
}
#if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) \
&& ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.values, a_.values, 0, 0, 2, 2, 4, 4, 6, 6);
b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 0, 1, 2, 3, 4, 5, 6, 7);
r_.values += b_.values * a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
r_.values[2 * i] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i]) + (-simde_float16_to_float32(b_.values[2 * i])) * simde_float16_to_float32(a_.values[2 * i]));
r_.values[2 * i + 1] = simde_float16_from_float32(
simde_float16_to_float32(r_.values[2 * i + 1]) + (-simde_float16_to_float32(b_.values[2 * i + 1])) * simde_float16_to_float32(a_.values[2 * i]));
}
#endif

result = simde_float16x8_from_private(r_);
return result;
Expand Down
Loading

0 comments on commit 2bab3a9

Please sign in to comment.