-
Notifications
You must be signed in to change notification settings - Fork 259
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
NEON: Implement some f16XN types and f16 related intrinsics. (#1071)
* [NEON] Add vmulq_f16 and vmul_f16. * [NEON] Add vmulq_f16 and vmul_f16 test. * [NEON] Add vget_lane_f16 and vgetq_lane_f16. * [NEON] Add vsubh_f16 and vsub_f16. * [NEON] Add vextq_f16. * [NEON] Add vget_low_f16. * [NEON] Add vmulq_lane_f16. * [NEON] Add vmul_n_f16. * [NEON] Add vget_high_f16. * [NEON] Add vsetq_lane_f16. * [NEON] Add vcombine_f16. * [NEON] Add vcvtaq_s32_f32, vcvtas_s32_f32, and vcvta_s32_f32. * [NEON] Add vpadd_f16. * [NEON] Add vuzp1_f16. * [NEON] Add vuzp2_f16. * [NEON] Add vmaxq_f16 and vmax_f16. * [NEON] Add vcvtas_u32_f32, vcvta_u32_f32, vcvtaq_u32_f32. * [NEON] Add type simde_float16x8x2_t. * [NEON] Add vld2q_f16. * [NEON] Add vld1q_dup_f16. * [NEON] Add vpmax_f16. * [NEON] Add vrsqrtsq_f16, vrsqrtsh_f16, vrsqrts_f16. * [NEON] Add vcgtq_f16, vcgt_f16, vcgth_f16. * [NEON] Add vdiv_f32 and vdivq_f32. * [NEON] Add vrecps_f16 and vrecpsq_f16. * [NEON] Add vset_lane_f16. * [NEON] Add vrecpe_f16, vrecpeq_f16. * [NEON] Add vfmaq_f16. * [NEON] Add vabsq_f16 and vabs_f16. * [NEON] Add vcltq_f16 and vclth_f16. * [NEON] Add vmin_f16 and vminq_f16. * [NEON] Add vclt_f16. * [NEON] Add vclezq_f16, vclez_f16, and vclezh_f16. * [NEON] Add vzip_f16 and vzipq_f16. * [NEON] Add vzip1_f16 and vzip1q_f16. * [NEON] Add vzip2_f16 and vzip2q_f16. * [NEON] Add vst2_f16 and vst2q_f16. * [NEON] Add type simde_float16x4x2. * [NEON] Add 16 intrinsics of vreinterpret series. * [SIMDE] Add sqrtl() in simde_math_sqrtl. * [NEON] Add vrndnq_f16, vrndns_f16, and vrndn_f16. * [NEON] Add sqrt in meson.build. * [NEON] Add 7 sqrt related intrinsics. * [Fix] Add the judge whether define sqrt() or not. * [NEON] Add vrsqrteq_f16, vrsqrte_f16, and vrsqrteh_f16. * [NEON] Add vqrshrnh_n_s16 and vqrshrnh_n_u16. * [NEON] Add vqrshrunh_n_s16. * [Fix] Add new conditions for fp16 intrinsics. * [License] Add Copyright.
- Loading branch information
Showing
82 changed files
with
6,412 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
* | ||
* Copyright: | ||
* 2020 Evan Nemerson <[email protected]> | ||
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology) | ||
*/ | ||
|
||
#if !defined(SIMDE_ARM_NEON_H) | ||
|
@@ -68,6 +69,7 @@ | |
#include "neon/cvtn.h" | ||
#include "neon/combine.h" | ||
#include "neon/create.h" | ||
#include "neon/div.h" | ||
#include "neon/dot.h" | ||
#include "neon/dot_lane.h" | ||
#include "neon/dup_lane.h" | ||
|
@@ -184,6 +186,7 @@ | |
#include "neon/shr_n.h" | ||
#include "neon/shrn_n.h" | ||
#include "neon/sqadd.h" | ||
#include "neon/sqrt.h" | ||
#include "neon/sra_n.h" | ||
#include "neon/sri_n.h" | ||
#include "neon/st1.h" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
* | ||
* Copyright: | ||
* 2020 Evan Nemerson <[email protected]> | ||
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology) | ||
*/ | ||
|
||
#if !defined(SIMDE_ARM_NEON_ABS_H) | ||
|
@@ -47,6 +48,45 @@ simde_vabsd_s64(int64_t a) { | |
#define vabsd_s64(a) simde_vabsd_s64(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_float16_t | ||
simde_vabsh_f16(simde_float16_t a) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vabsh_f16(a); | ||
#else | ||
simde_float32_t a_ = simde_float16_to_float32(a); | ||
|
||
return (a_ >= 0.0f) ? simde_float16_from_float32(a_) : simde_float16_from_float32(-a_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vabsh_f16 | ||
#define vabsh_f16(a) simde_vabsh_f16(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_float16x4_t | ||
simde_vabs_f16(simde_float16x4_t a) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vabs_f16(a); | ||
#else | ||
simde_float16x4_private | ||
r_, | ||
a_ = simde_float16x4_to_private(a); | ||
|
||
SIMDE_VECTORIZE | ||
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { | ||
r_.values[i] = simde_vabsh_f16(a_.values[i]); | ||
} | ||
|
||
return simde_float16x4_from_private(r_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vabs_f16 | ||
#define vabs_f16(a) simde_vabs_f16(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_float32x2_t | ||
simde_vabs_f32(simde_float32x2_t a) { | ||
|
@@ -211,6 +251,29 @@ simde_vabs_s64(simde_int64x1_t a) { | |
#define vabs_s64(a) simde_vabs_s64(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_float16x8_t | ||
simde_vabsq_f16(simde_float16x8_t a) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vabsq_f16(a); | ||
#else | ||
simde_float16x8_private | ||
r_, | ||
a_ = simde_float16x8_to_private(a); | ||
|
||
SIMDE_VECTORIZE | ||
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { | ||
r_.values[i] = simde_vabsh_f16(a_.values[i]); | ||
} | ||
|
||
return simde_float16x8_from_private(r_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vabsq_f16 | ||
#define vabsq_f16(a) simde_vabsq_f16(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_float32x4_t | ||
simde_vabsq_f32(simde_float32x4_t a) { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
* Copyright: | ||
* 2020 Evan Nemerson <[email protected]> | ||
* 2020 Christopher Moore <[email protected]> | ||
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology) | ||
*/ | ||
|
||
#if !defined(SIMDE_ARM_NEON_CGT_H) | ||
|
@@ -78,6 +79,23 @@ simde_vcgtd_u64(uint64_t a, uint64_t b) { | |
#define vcgtd_u64(a, b) simde_vcgtd_u64((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
uint16_t | ||
simde_vcgth_f16(simde_float16_t a, simde_float16_t b) { | ||
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return HEDLEY_STATIC_CAST(uint16_t, vcgth_f16(a, b)); | ||
#else | ||
simde_float32_t a_ = simde_float16_to_float32(a); | ||
simde_float32_t b_ = simde_float16_to_float32(b); | ||
|
||
return (a_ > b_) ? UINT16_MAX : 0; | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) | ||
#undef vcgth_f16 | ||
#define vcgth_f16(a, b) simde_vcgth_f16((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
uint32_t | ||
simde_vcgts_f32(simde_float32_t a, simde_float32_t b) { | ||
|
@@ -92,6 +110,30 @@ simde_vcgts_f32(simde_float32_t a, simde_float32_t b) { | |
#define vcgts_f32(a, b) simde_vcgts_f32((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint16x8_t | ||
simde_vcgtq_f16(simde_float16x8_t a, simde_float16x8_t b) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vcgtq_f16(a, b); | ||
#else | ||
simde_float16x8_private | ||
a_ = simde_float16x8_to_private(a), | ||
b_ = simde_float16x8_to_private(b); | ||
simde_uint16x8_private r_; | ||
|
||
SIMDE_VECTORIZE | ||
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { | ||
r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); | ||
} | ||
|
||
return simde_uint16x8_from_private(r_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vcgtq_f16 | ||
#define vcgtq_f16(a, b) simde_vcgtq_f16((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint32x4_t | ||
simde_vcgtq_f32(simde_float32x4_t a, simde_float32x4_t b) { | ||
|
@@ -442,6 +484,30 @@ simde_vcgtq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { | |
#define vcgtq_u64(a, b) simde_vcgtq_u64((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint16x4_t | ||
simde_vcgt_f16(simde_float16x4_t a, simde_float16x4_t b) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vcgt_f16(a, b); | ||
#else | ||
simde_float16x4_private | ||
a_ = simde_float16x4_to_private(a), | ||
b_ = simde_float16x4_to_private(b); | ||
simde_uint16x4_private r_; | ||
|
||
SIMDE_VECTORIZE | ||
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { | ||
r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); | ||
} | ||
|
||
return simde_uint16x4_from_private(r_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vcgt_f16 | ||
#define vcgt_f16(a, b) simde_vcgt_f16((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint32x2_t | ||
simde_vcgt_f32(simde_float32x2_t a, simde_float32x2_t b) { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
* Copyright: | ||
* 2020 Evan Nemerson <[email protected]> | ||
* 2020 Christopher Moore <[email protected]> | ||
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology) | ||
*/ | ||
|
||
#if !defined(SIMDE_ARM_NEON_CLEZ_H) | ||
|
@@ -78,6 +79,44 @@ simde_vclezs_f32(simde_float32_t a) { | |
#define vclezs_f32(a) simde_vclezs_f32(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
uint16_t | ||
simde_vclezh_f16(simde_float16_t a) { | ||
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return HEDLEY_STATIC_CAST(uint16_t, vclezh_f16(a)); | ||
#else | ||
simde_float32_t a_ = simde_float16_to_float32(a); | ||
|
||
return (a_ <= 0.0f) ? UINT16_MAX : 0; | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) | ||
#undef vclezh_f16 | ||
#define vclezh_f16(a) simde_vclezh_f16(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint16x8_t | ||
simde_vclezq_f16(simde_float16x8_t a) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vclezq_f16(a); | ||
#else | ||
simde_float16x8_private a_ = simde_float16x8_to_private(a); | ||
simde_uint16x8_private r_; | ||
|
||
SIMDE_VECTORIZE | ||
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { | ||
r_.values[i] = simde_vclezh_f16(a_.values[i]); | ||
} | ||
|
||
return simde_uint16x8_from_private(r_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vclezq_f16 | ||
#define vclezq_f16(a) simde_vclezq_f16(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint32x4_t | ||
simde_vclezq_f32(simde_float32x4_t a) { | ||
|
@@ -246,6 +285,28 @@ simde_vclezq_s64(simde_int64x2_t a) { | |
#define vclezq_s64(a) simde_vclezq_s64(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint16x4_t | ||
simde_vclez_f16(simde_float16x4_t a) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vclez_f16(a); | ||
#else | ||
simde_float16x4_private a_ = simde_float16x4_to_private(a); | ||
simde_uint16x4_private r_; | ||
|
||
SIMDE_VECTORIZE | ||
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { | ||
r_.values[i] = simde_vclezh_f16(a_.values[i]); | ||
} | ||
|
||
return simde_uint16x4_from_private(r_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vclez_f16 | ||
#define vclez_f16(a) simde_vclez_f16(a) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint32x2_t | ||
simde_vclez_f32(simde_float32x2_t a) { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
* Copyright: | ||
* 2020 Evan Nemerson <[email protected]> | ||
* 2020 Christopher Moore <[email protected]> | ||
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology) | ||
*/ | ||
|
||
#if !defined(SIMDE_ARM_NEON_CLT_H) | ||
|
@@ -77,6 +78,23 @@ simde_vcltd_u64(uint64_t a, uint64_t b) { | |
#define vcltd_u64(a, b) simde_vcltd_u64((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
uint16_t | ||
simde_vclth_f16(simde_float16_t a, simde_float16_t b) { | ||
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return HEDLEY_STATIC_CAST(uint16_t, vclth_f16(a, b)); | ||
#else | ||
simde_float32_t a_ = simde_float16_to_float32(a); | ||
simde_float32_t b_ = simde_float16_to_float32(b); | ||
|
||
return (a_ < b_) ? UINT16_MAX : 0; | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) | ||
#undef vclth_f16 | ||
#define vclth_f16(a, b) simde_vclth_f16((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
uint32_t | ||
simde_vclts_f32(simde_float32_t a, simde_float32_t b) { | ||
|
@@ -91,6 +109,30 @@ simde_vclts_f32(simde_float32_t a, simde_float32_t b) { | |
#define vclts_f32(a, b) simde_vclts_f32((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint16x8_t | ||
simde_vcltq_f16(simde_float16x8_t a, simde_float16x8_t b) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vcltq_f16(a, b); | ||
#else | ||
simde_float16x8_private | ||
a_ = simde_float16x8_to_private(a), | ||
b_ = simde_float16x8_to_private(b); | ||
simde_uint16x8_private r_; | ||
|
||
SIMDE_VECTORIZE | ||
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { | ||
r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); | ||
} | ||
|
||
return simde_uint16x8_from_private(r_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vcltq_f16 | ||
#define vcltq_f16(a, b) simde_vcltq_f16((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint32x4_t | ||
simde_vcltq_f32(simde_float32x4_t a, simde_float32x4_t b) { | ||
|
@@ -450,6 +492,30 @@ simde_vcltq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { | |
#define vcltq_u64(a, b) simde_vcltq_u64((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint16x4_t | ||
simde_vclt_f16(simde_float16x4_t a, simde_float16x4_t b) { | ||
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) | ||
return vclt_f16(a, b); | ||
#else | ||
simde_float16x4_private | ||
a_ = simde_float16x4_to_private(a), | ||
b_ = simde_float16x4_to_private(b); | ||
simde_uint16x4_private r_; | ||
|
||
SIMDE_VECTORIZE | ||
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { | ||
r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); | ||
} | ||
|
||
return simde_uint16x4_from_private(r_); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) | ||
#undef vclt_f16 | ||
#define vclt_f16(a, b) simde_vclt_f16((a), (b)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint32x2_t | ||
simde_vclt_f32(simde_float32x2_t a, simde_float32x2_t b) { | ||
|
Oops, something went wrong.