-
Notifications
You must be signed in to change notification settings - Fork 259
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
NEON: more fp16 using intrinsics supported by architecture v7 (skip v…
…ersion) (#1081) * [NEON] Add vabal_{s/u}{8/16/32} * [NEON] Add vabal_high_{s/u}{8/16/32} * [NEON] Add all vcale* intrinsics (9) * [NEON] Add all vcalt intrinsics (9) * [NEON] Add vcreate_f16 * [NEON] Add vreinterpret_u64_f16 * [NEON] Add vcvth_f16_s16 and vcvth_f16_u16 * [NEON] Add vduph_lane_f16, vdup_lane_f16, and vdupq_lane_f16 * [NEON] Add vext_f16 * [NEON] Add 16 vcvt{q}_n_* intrinsics * [Fix] Correct function input parameters * [NEON] Add 6 vcvtn_{s/u}{16/32/64}_f{*} intrinsics * [Fix] Correct vdup_lane_f16 and vdupq_lane_f16. * [Fix] Correct function input parameters. * [NEON] Add 24 vcvt{q}_n_* intrinsics * [NEON] Add all vcvtn* intrinsics * [NEON] Add vfmah_f16 and vfma_f16 * [NEON] Add vfma_n_f16 and vfmaq_n_f16 * [NEON] Add vmulh_f16 * [NEON] Add fma_lane related intrinsics. * [NEON] Add 5 vmul* related intrinsics vmulh_lane_f16, vmulh_laneq_f16, vmul_lane_f16, vmul_laneq_f16, vmulq_laneq_f16. * [NEON] Add neg related intrinsics. * [NEON] Add all fms, fms_n, and fms_lane intrinsics * [NEON] Add types float16x{4/8}x{2/3/4} * [NEON] Add 9 vld1 related intrinsics * [Fix] Modified wrong rounding implementation. Modified wrong implementation "Ties to Away" to "rounding to nearest with ties to Away" add.h: Remove redundant code. * [Fix] Fix wrong intrinsic alias names. * [Refactor] Remove redundant functions. * [NEON] Add 45 ld2 related intrinsics one ld2_f16, twenty-two ld2_lane series, and twenty-two ld2_dup series. * [NEON] Add ld3_dup, ld3_lane, and ld4_dup * [NEON] Add vld3_f16 and vld4_f16. * [NEON] Add vld{3/4}_{dup/lane} series intrinsics * [NEON] Add mla_{high}_lane series intrinsics * [NEON] Add qdmlal_{high}_{lane} series intrinsics. * [NEON] Add qdmlal_lane and qdmlal_n series intrinsics * [NEON] Add mls_lane and mlsl_high_lane series intrinsics * [NEON] Add 22 qdmlsl series intrinsics * [NEON] Add 10 qdmull_* series intrinsics * [NEON] Add 3 qdmulh series intrinsics * [Fix] Fix wrong function name. * [Fix] Correct the wrong alias function name. 
* [NEON] Add qdmullh_lane{q}_s{16/32} related intrinsics * [NEON] Add qdmull_n and qdmull_high_lane series intrinsics * [Fix] Add conditions for fp16 intrinsics * [Hack] Skip functions that trigger compiler bugs.
- Loading branch information
Showing
129 changed files
with
33,598 additions
and
190 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
/* SPDX-License-Identifier: MIT | ||
* | ||
* Permission is hereby granted, free of charge, to any person | ||
* obtaining a copy of this software and associated documentation | ||
* files (the "Software"), to deal in the Software without | ||
* restriction, including without limitation the rights to use, copy, | ||
* modify, merge, publish, distribute, sublicense, and/or sell copies | ||
* of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be | ||
* included in all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
* SOFTWARE. | ||
* | ||
* Copyright: | ||
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology) | ||
*/ | ||
|
||
#if !defined(SIMDE_ARM_NEON_ABAL_H) | ||
#define SIMDE_ARM_NEON_ABAL_H | ||
|
||
#include "abdl.h" | ||
#include "add.h" | ||
|
||
HEDLEY_DIAGNOSTIC_PUSH | ||
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS | ||
SIMDE_BEGIN_DECLS_ | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_int16x8_t | ||
simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) { | ||
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) | ||
return vabal_s8(a, b, c); | ||
#else | ||
return simde_vaddq_s16(simde_vabdl_s8(b, c), a); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) | ||
#undef vabal_s8 | ||
#define vabal_s8(a, b, c) simde_vabal_s8((a), (b), (c)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_int32x4_t | ||
simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { | ||
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) | ||
return vabal_s16(a, b, c); | ||
#else | ||
return simde_vaddq_s32(simde_vabdl_s16(b, c), a); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) | ||
#undef vabal_s16 | ||
#define vabal_s16(a, b, c) simde_vabal_s16((a), (b), (c)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_int64x2_t | ||
simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { | ||
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) | ||
return vabal_s32(a, b, c); | ||
#else | ||
return simde_vaddq_s64(simde_vabdl_s32(b, c), a); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) | ||
#undef vabal_s32 | ||
#define vabal_s32(a, b, c) simde_vabal_s32((a), (b), (c)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint16x8_t | ||
simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { | ||
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) | ||
return vabal_u8(a, b, c); | ||
#else | ||
return simde_vaddq_u16(simde_vabdl_u8(b, c), a); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) | ||
#undef vabal_u8 | ||
#define vabal_u8(a, b, c) simde_vabal_u8((a), (b), (c)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint32x4_t | ||
simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { | ||
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) | ||
return vabal_u16(a, b, c); | ||
#else | ||
return simde_vaddq_u32(simde_vabdl_u16(b, c), a); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) | ||
#undef vabal_u16 | ||
#define vabal_u16(a, b, c) simde_vabal_u16((a), (b), (c)) | ||
#endif | ||
|
||
SIMDE_FUNCTION_ATTRIBUTES | ||
simde_uint64x2_t | ||
simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { | ||
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) | ||
return vabal_u32(a, b, c); | ||
#else | ||
return simde_vaddq_u64(simde_vabdl_u32(b, c), a); | ||
#endif | ||
} | ||
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) | ||
#undef vabal_u32 | ||
#define vabal_u32(a, b, c) simde_vabal_u32((a), (b), (c)) | ||
#endif | ||
|
||
|
||
SIMDE_END_DECLS_ | ||
HEDLEY_DIAGNOSTIC_POP | ||
|
||
#endif /* !defined(SIMDE_ARM_NEON_ABAL_H) */ |
Oops, something went wrong.