Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NEON: more fp16 using intrinsics supported by architecture v7 (skip version) #1081

Merged
merged 45 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ce9c71c
[NEON] Add vabal_{s/u}{8/16/32}
yyctw Jun 21, 2023
ebfa9fc
[NEON] Add vabal_high_{s/u}{8/16/32}
yyctw Jun 21, 2023
d6f44df
[NEON] Add all vcale* intrinsics (9)
yyctw Jun 21, 2023
684e6cd
[NEON] Add all vcalt intrinsics (9)
yyctw Jun 21, 2023
46f80ad
[NEON] Add vcreate_f16
yyctw Jun 21, 2023
51fc505
[NEON] Add vreinterpret_u64_f16
yyctw Jun 21, 2023
5129a66
[NEON] Add vcvth_f16_s16 and vcvth_f16_u16
yyctw Jul 7, 2023
ea344d4
[NEON] Add vduph_lane_f16, vdup_lane_f16, and vdupq_lane_f16
yyctw Jul 7, 2023
a6b2211
[NEON] Add vext_f16
yyctw Jul 7, 2023
6ac66c8
[NEON] Add 16 vcvt{q}_n_* intrinsics
yyctw Jul 7, 2023
a8a5a01
[Fix] Correct function input parameters
yyctw Jul 10, 2023
5dbccfa
[NEON] Add 6 vcvtn_{s/u}{16/32/64}_f{*} intrinsics
yyctw Jul 10, 2023
bba63e2
[Fix] Correct vdup_lane_f16 and vdupq_lane_f16.
yyctw Jul 10, 2023
429acee
[Fix] Correct function input parameters.
yyctw Jul 10, 2023
23599f1
[NEON] Add 24 vcvt{q}_n_* intrinsics
yyctw Jul 11, 2023
af7989a
[NEON] Add all vcvtn* intrinsics
yyctw Jul 17, 2023
cddd206
[NEON] Add vfmah_f16 and vfma_f16
yyctw Jul 18, 2023
9fb2adc
[NEON] Add vfma_n_f16 and vfmaq_n_f16
yyctw Jul 18, 2023
66df463
[NEON] Add vmulh_f16
yyctw Jul 18, 2023
704c647
[NEON] Add fma_lane related intrinsics.
yyctw Jul 18, 2023
a3fbd43
[NEON] Add 5 vmul* related intrinsics
yyctw Jul 18, 2023
6fdc082
[NEON] Add neg related intrinsics.
yyctw Jul 18, 2023
9a9cec6
[NEON] Add all fms, fms_n, and fms_lane intrinsics
yyctw Jul 19, 2023
c613af7
[NEON] Add types float16x{4/8}x{2/3/4}
yyctw Jul 19, 2023
d5a8e52
[NEON] Add 9 vld1 related intrinsics
yyctw Jul 19, 2023
bc1bb5e
[Fix] Modified wrong rounding implementation.
yyctw Jul 20, 2023
e42299f
[Fix] Fix wrong intrinsic alias names.
yyctw Jul 20, 2023
60163ad
[Refactor] Remove redundant functions.
yyctw Jul 25, 2023
5ff6097
[NEON] Add 45 ld2 related intrinsics
yyctw Jul 26, 2023
cc3542a
[NEON] Add ld3_dup, ld3_lane, and ld4_dup
yyctw Jul 28, 2023
4a647c4
[NEON] Add vld3_f16 and vld4_f16.
yyctw Jul 28, 2023
3f5357f
[NEON] Add vld{3/4}_{dup/lane} series intrinsics
yyctw Jul 28, 2023
32f0645
[NEON] Add mla_{high}_lane series intrinsics
yyctw Aug 1, 2023
921c4e2
[NEON] Add qdmlal_{high}_{lane} series intrinsics.
yyctw Aug 1, 2023
fbf48ef
[NEON] Add qdmlal_lane and qdmlal_n series intrinsics
yyctw Aug 1, 2023
0d01303
[NEON] Add mls_lane and mlsl_high_lane series intrinsics
yyctw Aug 2, 2023
e4ada23
[NEON] Add 22 qdmlsl series intrinsics
yyctw Aug 2, 2023
c5f6d4c
[NEON] Add 10 qdmull_* series intrinsics
yyctw Aug 2, 2023
be1bd26
[NEON] Add 3 qdmulh series intrinsics
yyctw Aug 2, 2023
e69bafe
[Fix] Fix wrong function name.
yyctw Aug 2, 2023
b7de455
[Fix] Correct the wrong alias function name.
yyctw Aug 4, 2023
0a1fdd5
[NEON] Add qdmullh_lane{q}_s{16/32} related intrinsics
yyctw Aug 4, 2023
a624b17
[NEON] Add qdmull_n and qdmull_high_lane series intrinsics
yyctw Aug 4, 2023
edb3576
[Fix] Add conditions for fp16 intrinsics
yyctw Oct 11, 2023
d5c2855
[Hack] Skip functions that trigger compiler bugs.
yyctw Oct 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ cxx = meson.get_compiler('cpp')

simde_neon_families = [
'aba',
'abal',
'abal_high',
'abd',
'abdl',
'abs',
Expand All @@ -29,6 +31,8 @@ simde_neon_families = [
'cadd_rot90',
'cage',
'cagt',
'cale',
'calt',
'ceq',
'ceqz',
'cge',
Expand All @@ -51,6 +55,7 @@ simde_neon_families = [
'cmla_rot270',
'cnt',
'cvt',
'cvt_n',
'cvtn',
'combine',
'create',
Expand All @@ -64,6 +69,9 @@ simde_neon_families = [
'fma',
'fma_lane',
'fma_n',
'fms',
'fms_lane',
'fms_n',
'get_high',
'get_lane',
'get_low',
Expand All @@ -79,8 +87,13 @@ simde_neon_families = [
'ld1q_x4',
'ld1',
'ld2',
'ld2_dup',
'ld2_lane',
'ld3',
'ld3_dup',
'ld3_lane',
'ld4',
'ld4_dup',
'ld4_lane',
'max',
'maxnm',
Expand All @@ -93,16 +106,20 @@ simde_neon_families = [
'mla_n',
'mlal',
'mlal_high',
'mlal_high_lane',
'mlal_high_n',
'mlal_lane',
'mlal_n',
'mls',
'mls_lane',
'mls_n',
'mlsl',
'mlsl_high',
'mlsl_high_lane',
'mlsl_high_n',
'mlsl_lane',
'mlsl_n',
#'mmlaq',
'movl',
'movl_high',
'movn',
Expand All @@ -125,10 +142,27 @@ simde_neon_families = [
'pmin',
'qadd',
'qabs',
'qdmlal',
'qdmlal_high',
'qdmlal_high_lane',
'qdmlal_high_n',
'qdmlal_lane',
'qdmlal_n',
'qdmlsl',
'qdmlsl_high',
'qdmlsl_high_lane',
'qdmlsl_high_n',
'qdmlsl_lane',
'qdmlsl_n',
'qdmulh',
'qdmulh_lane',
'qdmulh_n',
'qdmull',
'qdmull_high',
'qdmull_high_lane',
'qdmull_high_n',
'qdmull_lane',
'qdmull_n',
'qrdmulh',
'qrdmulh_lane',
'qrdmulh_n',
Expand Down
34 changes: 34 additions & 0 deletions simde/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include "neon/types.h"

#include "neon/aba.h"
#include "neon/abal.h"
#include "neon/abal_high.h"
#include "neon/abd.h"
#include "neon/abdl.h"
#include "neon/abs.h"
Expand All @@ -50,6 +52,8 @@
#include "neon/cadd_rot90.h"
#include "neon/cage.h"
#include "neon/cagt.h"
#include "neon/cale.h"
#include "neon/calt.h"
#include "neon/ceq.h"
#include "neon/ceqz.h"
#include "neon/cge.h"
Expand All @@ -72,6 +76,7 @@
#include "neon/cmla_rot270.h"
#include "neon/cnt.h"
#include "neon/cvt.h"
#include "neon/cvt_n.h"
#include "neon/cvtn.h"
#include "neon/combine.h"
#include "neon/create.h"
Expand All @@ -85,6 +90,9 @@
#include "neon/fma.h"
#include "neon/fma_lane.h"
#include "neon/fma_n.h"
#include "neon/fms.h"
#include "neon/fms_lane.h"
#include "neon/fms_n.h"
#include "neon/get_high.h"
#include "neon/get_lane.h"
#include "neon/get_low.h"
Expand All @@ -100,8 +108,13 @@
#include "neon/ld1q_x3.h"
#include "neon/ld1q_x4.h"
#include "neon/ld2.h"
#include "neon/ld2_dup.h"
#include "neon/ld2_lane.h"
#include "neon/ld3.h"
#include "neon/ld3_dup.h"
#include "neon/ld3_lane.h"
#include "neon/ld4.h"
#include "neon/ld4_dup.h"
#include "neon/ld4_lane.h"
#include "neon/max.h"
#include "neon/maxnm.h"
Expand All @@ -114,16 +127,20 @@
#include "neon/mla_n.h"
#include "neon/mlal.h"
#include "neon/mlal_high.h"
#include "neon/mlal_high_lane.h"
#include "neon/mlal_high_n.h"
#include "neon/mlal_lane.h"
#include "neon/mlal_n.h"
#include "neon/mls.h"
#include "neon/mls_lane.h"
#include "neon/mls_n.h"
#include "neon/mlsl.h"
#include "neon/mlsl_high.h"
#include "neon/mlsl_high_lane.h"
#include "neon/mlsl_high_n.h"
#include "neon/mlsl_lane.h"
#include "neon/mlsl_n.h"
//#include "neon/mmlaq.h"
#include "neon/movl.h"
#include "neon/movl_high.h"
#include "neon/movn.h"
Expand All @@ -146,10 +163,27 @@
#include "neon/pmin.h"
#include "neon/qabs.h"
#include "neon/qadd.h"
#include "neon/qdmlal.h"
#include "neon/qdmlal_high.h"
#include "neon/qdmlal_high_lane.h"
#include "neon/qdmlal_high_n.h"
#include "neon/qdmlal_lane.h"
#include "neon/qdmlal_n.h"
#include "neon/qdmlsl.h"
#include "neon/qdmlsl_high.h"
#include "neon/qdmlsl_high_lane.h"
#include "neon/qdmlsl_high_n.h"
#include "neon/qdmlsl_lane.h"
#include "neon/qdmlsl_n.h"
#include "neon/qdmulh.h"
#include "neon/qdmulh_lane.h"
#include "neon/qdmulh_n.h"
#include "neon/qdmull.h"
#include "neon/qdmull_high.h"
#include "neon/qdmull_high_lane.h"
#include "neon/qdmull_high_n.h"
#include "neon/qdmull_lane.h"
#include "neon/qdmull_n.h"
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
Expand Down
125 changes: 125 additions & 0 deletions simde/arm/neon/abal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_ABAL_H)
#define SIMDE_ARM_NEON_ABAL_H

#include "abdl.h"
#include "add.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

/* vabal_s8: accumulate into the 16-bit lanes of `a` the value produced by
 * simde_vabdl_s8(b, c).
 *
 * With native ARMv7 NEON the call is forwarded to the vabal_s8 intrinsic.
 * Otherwise the result is composed from the portable simde_vabdl_s8 and
 * simde_vaddq_s16 helpers. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s8(a, b, c);
  #else
    /* Compute the widened term first, then fold it into the accumulator. */
    const simde_int16x8_t widened_diff = simde_vabdl_s8(b, c);
    return simde_vaddq_s16(widened_diff, a);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  /* Expose the SIMDe implementation under the native intrinsic name. */
  #undef vabal_s8
  #define vabal_s8(a, b, c) simde_vabal_s8((a), (b), (c))
#endif

/* vabal_s16: accumulate into the 32-bit lanes of `a` the value produced by
 * simde_vabdl_s16(b, c).
 *
 * With native ARMv7 NEON the call is forwarded to the vabal_s16 intrinsic.
 * Otherwise the result is composed from the portable simde_vabdl_s16 and
 * simde_vaddq_s32 helpers. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s16(a, b, c);
  #else
    /* Compute the widened term first, then fold it into the accumulator. */
    const simde_int32x4_t widened_diff = simde_vabdl_s16(b, c);
    return simde_vaddq_s32(widened_diff, a);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  /* Expose the SIMDe implementation under the native intrinsic name. */
  #undef vabal_s16
  #define vabal_s16(a, b, c) simde_vabal_s16((a), (b), (c))
#endif

/* vabal_s32: accumulate into the 64-bit lanes of `a` the value produced by
 * simde_vabdl_s32(b, c).
 *
 * With native ARMv7 NEON the call is forwarded to the vabal_s32 intrinsic.
 * Otherwise the result is composed from the portable simde_vabdl_s32 and
 * simde_vaddq_s64 helpers. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s32(a, b, c);
  #else
    /* Compute the widened term first, then fold it into the accumulator. */
    const simde_int64x2_t widened_diff = simde_vabdl_s32(b, c);
    return simde_vaddq_s64(widened_diff, a);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  /* Expose the SIMDe implementation under the native intrinsic name. */
  #undef vabal_s32
  #define vabal_s32(a, b, c) simde_vabal_s32((a), (b), (c))
#endif

/* vabal_u8: accumulate into the unsigned 16-bit lanes of `a` the value
 * produced by simde_vabdl_u8(b, c).
 *
 * With native ARMv7 NEON the call is forwarded to the vabal_u8 intrinsic.
 * Otherwise the result is composed from the portable simde_vabdl_u8 and
 * simde_vaddq_u16 helpers. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u8(a, b, c);
  #else
    /* Compute the widened term first, then fold it into the accumulator. */
    const simde_uint16x8_t widened_diff = simde_vabdl_u8(b, c);
    return simde_vaddq_u16(widened_diff, a);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  /* Expose the SIMDe implementation under the native intrinsic name. */
  #undef vabal_u8
  #define vabal_u8(a, b, c) simde_vabal_u8((a), (b), (c))
#endif

/* vabal_u16: accumulate into the unsigned 32-bit lanes of `a` the value
 * produced by simde_vabdl_u16(b, c).
 *
 * With native ARMv7 NEON the call is forwarded to the vabal_u16 intrinsic.
 * Otherwise the result is composed from the portable simde_vabdl_u16 and
 * simde_vaddq_u32 helpers. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u16(a, b, c);
  #else
    /* Compute the widened term first, then fold it into the accumulator. */
    const simde_uint32x4_t widened_diff = simde_vabdl_u16(b, c);
    return simde_vaddq_u32(widened_diff, a);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  /* Expose the SIMDe implementation under the native intrinsic name. */
  #undef vabal_u16
  #define vabal_u16(a, b, c) simde_vabal_u16((a), (b), (c))
#endif

/* vabal_u32: accumulate into the unsigned 64-bit lanes of `a` the value
 * produced by simde_vabdl_u32(b, c).
 *
 * With native ARMv7 NEON the call is forwarded to the vabal_u32 intrinsic.
 * Otherwise the result is composed from the portable simde_vabdl_u32 and
 * simde_vaddq_u64 helpers. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u32(a, b, c);
  #else
    /* Compute the widened term first, then fold it into the accumulator. */
    const simde_uint64x2_t widened_diff = simde_vabdl_u32(b, c);
    return simde_vaddq_u64(widened_diff, a);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  /* Expose the SIMDe implementation under the native intrinsic name. */
  #undef vabal_u32
  #define vabal_u32(a, b, c) simde_vabal_u32((a), (b), (c))
#endif


SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_ARM_NEON_ABAL_H) */
Loading