Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NEON: implement all intrinsics supported by architecture A64-part1 #1090

Merged
merged 42 commits into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
e85f3dc
[NEON] Add qshrnh_n_{s/u}16.
yyctw Aug 8, 2023
67c7d95
[NEON] Add qrshr{u}n_high_n, qshrn_high_n, and rshrn_high_n.
yyctw Aug 8, 2023
20dfdc0
[NEON] Add qrshrn_high_n_{s/u}{16/32/64}.
yyctw Aug 8, 2023
78abecf
[NEON] Add qrshrun_high_n_s{16/32/64}.
yyctw Aug 8, 2023
aa58563
[NEON] Add qshrn_high_n_{s/u}{16/32/64}.
yyctw Aug 8, 2023
8a06d47
[NEON] Add rshrn_high_n_{s/u}{16/32/64}.
yyctw Aug 8, 2023
3ca0369
[NEON] Add r{add/sub}hn_{high} and reinterpret series.
yyctw Aug 9, 2023
6df53cd
[NEON] Add raddhn_{high} series.
yyctw Aug 9, 2023
d9cff09
[NEON] Add rsubhn_{high} series.
yyctw Aug 9, 2023
40c5c3d
[NEON] Completed reinterpret series.
yyctw Aug 9, 2023
2808975
[NEON] Add sli_n, st1_x2, and st1q_x2.
yyctw Oct 19, 2023
ca09873
[NEON] Add 18 sli{q}_n_{s/u}{8/16/32/64} intrinsics.
yyctw Aug 10, 2023
0d64820
[NEON] Add 22 st1{q}_{TYPE}_x2 series intrinsic.
yyctw Oct 19, 2023
c7a6663
[NEON] Add st1{q}_x3 series.
yyctw Oct 19, 2023
050bc67
[NEON] Add qrshl, st1_x4, and st1q_x4 series.
yyctw Aug 14, 2023
a45e6a1
[NEON] Add 4 intrinsics (st{3/4}{q}_f16).
yyctw Aug 14, 2023
9be7e95
[NEON] Add 8 intrinsics (vst{1/2/3/4}{q}_lane_f16).
yyctw Aug 14, 2023
3b18f15
[NEON] Add vld{3/4}q_f16 intrinsics
yyctw Aug 14, 2023
f71e013
[NEON] Add vtrn{/1/2}{q}_f16 intrinsics
yyctw Aug 14, 2023
57bff2b
[NEON] Add 4 intrinsics (vuzp{q}_f16 and vuzp{1/2}q_f16)
yyctw Aug 14, 2023
5bb2aec
[NEON] Add 2 intrinsics (vrev64{q}_f16).
yyctw Aug 14, 2023
f174ec5
[NEON] Add 24 intrinsics (vqrshl{q}_{TYPE}, vqrshl{b/h/s/d}_{TYPE}).
yyctw Aug 15, 2023
78837a9
[NEON] Add abdl_high, addhn_high, and qshl_n.
yyctw Aug 16, 2023
ba0eaba
[NEON] Add 24 intrinsics (vqshl{q}_n series).
yyctw Aug 16, 2023
6f5f3f5
[NEON] Add 6 intrinsics (vabdl_high_{s/u} series).
yyctw Aug 16, 2023
e87855b
[NEON] Add 6 intrinsics (vaddhn_high_{s/u} series).
yyctw Aug 16, 2023
ba86dd6
[NEON] Add 4 intrinsics (3 vabd{h//q}_f16 and 1 vabsh_f16).
yyctw Aug 16, 2023
a41ad47
[NEON] Add 3 intrinsics (vcgez{/h/q}_f16).
yyctw Aug 16, 2023
03a8584
[NEON] Add 3 intrinsics (vcgtz{/h/q}_f16).
yyctw Aug 16, 2023
2a9608a
[NEON] Add 3 intrinsics (vcle{/h/q}_f16).
yyctw Aug 16, 2023
254dec3
[NEON] Add 3 intrinsics (vcltz{/h/q}_f16).
yyctw Aug 16, 2023
7ce1eb9
[NEON] Add cvtm, cvtp, and copy_lane.
yyctw Aug 23, 2023
9773af8
[NEON] Add 40 intrinsics (vcopy{q}_lane{q}_{TYPE}).
yyctw Aug 23, 2023
7aee29b
[NEON] Add 33 vcvt series intrinsics.
yyctw Oct 19, 2023
e71bbd7
[NEON] Add 20 vcvt{h/s/d}_n_{TYPE} series intrinsics.
yyctw Oct 19, 2023
d2f9d1e
[NEON] Add 22 intrinsics (vcvtm_{TYPE}).
yyctw Aug 23, 2023
0588b4d
[NEON] Add 22 intrinsics (vcvtp_{TYPE}).
yyctw Aug 23, 2023
1423839
[Fix] Add copyright
yyctw Oct 19, 2023
3acfd40
[Fix] Fix bugs
yyctw Oct 18, 2023
4b1f792
[Fix] Fix bugs (10/23)
Oct 23, 2023
88892f0
[Fix] Fix the bugs in the initial review.
yyctw Oct 23, 2023
dccd46e
[Fix] Fix the bugs in the 1st 75 files review.
yyctw Oct 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,7 @@ jobs:
- name: Configure and Build
run: |
meson --backend=ninja build --cross-file test/arm64cl.txt
meson test -C build --print-errorlogs $(meson test -C build --list | grep -v emul)
ninja -C build test

linux-gcc-loongarch64:
runs-on: ubuntu-22.04
Expand Down
16 changes: 16 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ simde_neon_families = [
'abal_high',
'abd',
'abdl',
'abdl_high',
'abs',
'add',
'addhn',
'addhn_high',
'addl',
'addlv',
'addl_high',
Expand Down Expand Up @@ -56,8 +58,11 @@ simde_neon_families = [
'cnt',
'cvt',
'cvt_n',
'cvtm',
'cvtn',
'cvtp',
'combine',
'copy_lane',
'create',
'div',
'dot',
Expand Down Expand Up @@ -166,19 +171,26 @@ simde_neon_families = [
'qrdmulh',
'qrdmulh_lane',
'qrdmulh_n',
'qrshl',
'qrshrn_high_n',
'qrshrn_n',
'qrshrun_high_n',
'qrshrun_n',
'qmovn',
'qmovn_high',
'qmovun',
'qneg',
'qshl',
'qshl_n',
'qshlu_n',
'qshrn_high_n',
'qshrn_n',
'qshrun_n',
'qsub',
'qtbl',
'qtbx',
'raddhn',
'raddhn_high',
'rbit',
'recpe',
'recps',
Expand All @@ -194,16 +206,20 @@ simde_neon_families = [
'rndp',
'rshl',
'rshr_n',
'rshrn_high_n',
'rshrn_n',
'rsqrte',
'rsqrts',
'rsra_n',
'rsubhn',
'rsubhn_high',
'set_lane',
'shl',
'shl_n',
'shll_n',
'shr_n',
'shrn_n',
'sli_n',
'sqadd',
'sqrt',
'sra_n',
Expand Down
16 changes: 16 additions & 0 deletions simde/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@
#include "neon/abal_high.h"
#include "neon/abd.h"
#include "neon/abdl.h"
#include "neon/abdl_high.h"
#include "neon/abs.h"
#include "neon/add.h"
#include "neon/addhn.h"
#include "neon/addhn_high.h"
#include "neon/addl.h"
#include "neon/addlv.h"
#include "neon/addl_high.h"
Expand Down Expand Up @@ -77,8 +79,11 @@
#include "neon/cnt.h"
#include "neon/cvt.h"
#include "neon/cvt_n.h"
#include "neon/cvtm.h"
#include "neon/cvtn.h"
#include "neon/cvtp.h"
#include "neon/combine.h"
#include "neon/copy_lane.h"
#include "neon/create.h"
#include "neon/div.h"
#include "neon/dot.h"
Expand Down Expand Up @@ -187,19 +192,26 @@
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
#include "neon/qrshl.h"
#include "neon/qrshrn_high_n.h"
#include "neon/qrshrn_n.h"
#include "neon/qrshrun_high_n.h"
#include "neon/qrshrun_n.h"
#include "neon/qmovn.h"
#include "neon/qmovun.h"
#include "neon/qmovn_high.h"
#include "neon/qneg.h"
#include "neon/qsub.h"
#include "neon/qshl.h"
#include "neon/qshl_n.h"
#include "neon/qshlu_n.h"
#include "neon/qshrn_high_n.h"
#include "neon/qshrn_n.h"
#include "neon/qshrun_n.h"
#include "neon/qtbl.h"
#include "neon/qtbx.h"
#include "neon/raddhn.h"
#include "neon/raddhn_high.h"
#include "neon/rbit.h"
#include "neon/recpe.h"
#include "neon/recps.h"
Expand All @@ -215,16 +227,20 @@
#include "neon/rndp.h"
#include "neon/rshl.h"
#include "neon/rshr_n.h"
#include "neon/rshrn_high_n.h"
#include "neon/rshrn_n.h"
#include "neon/rsqrte.h"
#include "neon/rsqrts.h"
#include "neon/rsra_n.h"
#include "neon/rsubhn.h"
#include "neon/rsubhn_high.h"
#include "neon/set_lane.h"
#include "neon/shl.h"
#include "neon/shl_n.h"
#include "neon/shll_n.h"
#include "neon/shr_n.h"
#include "neon/shrn_n.h"
#include "neon/sli_n.h"
#include "neon/sqadd.h"
#include "neon/sqrt.h"
#include "neon/sra_n.h"
Expand Down
46 changes: 46 additions & 0 deletions simde/arm/neon/abd.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson <[email protected]>
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_ABD_H)
Expand All @@ -37,6 +38,23 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

/* Scalar half-precision difference magnitude of a and b.
 *
 * When A64 NEON with FP16 support is available the native vabdh_f16 is
 * used.  Otherwise both operands are widened to float32, subtracted, the
 * difference is negated if it compares below zero, and the result is
 * narrowed back to float16. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float16_t
simde_vabdh_f16(simde_float16_t a, simde_float16_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vabdh_f16(a, b);
  #else
    simde_float32_t lhs = simde_float16_to_float32(a);
    simde_float32_t rhs = simde_float16_to_float32(b);
    simde_float32_t diff = lhs - rhs;

    if (diff < 0) {
      return simde_float16_from_float32(-diff);
    }
    return simde_float16_from_float32(diff);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabdh_f16
  #define vabdh_f16(a, b) simde_vabdh_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vabds_f32(simde_float32_t a, simde_float32_t b) {
Expand Down Expand Up @@ -67,6 +85,20 @@ simde_vabdd_f64(simde_float64_t a, simde_float64_t b) {
#define vabdd_f64(a, b) simde_vabdd_f64((a), (b))
#endif

/* Half-precision absolute difference on simde_float16x4_t vectors.
 *
 * The native vabd_f16 is used whenever an A32V8-or-later NEON target with
 * FP16 support is detected.  This is the same tier that guards the native
 * alias below and the 128-bit sibling simde_vabdq_f16; the previous
 * A64-only guard took the portable path needlessly on A32 targets.
 *
 * The portable path composes simde_vsub_f16 and simde_vabs_f16. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float16x4_t
simde_vabd_f16(simde_float16x4_t a, simde_float16x4_t b) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vabd_f16(a, b);
  #else
    return simde_vabs_f16(simde_vsub_f16(a, b));
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabd_f16
  #define vabd_f16(a, b) simde_vabd_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vabd_f32(simde_float32x2_t a, simde_float32x2_t b) {
Expand Down Expand Up @@ -220,6 +252,20 @@ simde_vabd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#define vabd_u32(a, b) simde_vabd_u32((a), (b))
#endif

/* Half-precision absolute difference on simde_float16x8_t vectors.
 *
 * Uses the native vabdq_f16 on A32V8-or-later NEON targets with FP16
 * support; otherwise subtracts with simde_vsubq_f16 and passes the
 * difference through simde_vabsq_f16. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float16x8_t
simde_vabdq_f16(simde_float16x8_t a, simde_float16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
    return vabdq_f16(a, b);
  #else
    simde_float16x8_t diff = simde_vsubq_f16(a, b);
    return simde_vabsq_f16(diff);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabdq_f16
  #define vabdq_f16(a, b) simde_vabdq_f16((a), (b))
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vabdq_f32(simde_float32x4_t a, simde_float32x4_t b) {
Expand Down
123 changes: 123 additions & 0 deletions simde/arm/neon/abdl_high.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_ABDL_HIGH_H)
#define SIMDE_ARM_NEON_ABDL_HIGH_H

#include "abdl.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

/* vabdl_high_s8: applies simde_vabdl_s8 to the high halves of a and b
 * (as extracted by simde_vget_high_s8), yielding a simde_int16x8_t.
 * The native intrinsic is used on A64 NEON targets. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vabdl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vabdl_high_s8(a, b);
  #else
    simde_int8x8_t a_hi = simde_vget_high_s8(a);
    simde_int8x8_t b_hi = simde_vget_high_s8(b);
    return simde_vabdl_s8(a_hi, b_hi);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabdl_high_s8
  #define vabdl_high_s8(a, b) simde_vabdl_high_s8((a), (b))
#endif

/* vabdl_high_s16: applies simde_vabdl_s16 to the high halves of a and b
 * (as extracted by simde_vget_high_s16), yielding a simde_int32x4_t.
 * The native intrinsic is used on A64 NEON targets. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vabdl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vabdl_high_s16(a, b);
  #else
    simde_int16x4_t a_hi = simde_vget_high_s16(a);
    simde_int16x4_t b_hi = simde_vget_high_s16(b);
    return simde_vabdl_s16(a_hi, b_hi);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabdl_high_s16
  #define vabdl_high_s16(a, b) simde_vabdl_high_s16((a), (b))
#endif

/* vabdl_high_s32: applies simde_vabdl_s32 to the high halves of a and b
 * (as extracted by simde_vget_high_s32), yielding a simde_int64x2_t.
 * The native intrinsic is used on A64 NEON targets. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vabdl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vabdl_high_s32(a, b);
  #else
    simde_int32x2_t a_hi = simde_vget_high_s32(a);
    simde_int32x2_t b_hi = simde_vget_high_s32(b);
    return simde_vabdl_s32(a_hi, b_hi);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabdl_high_s32
  #define vabdl_high_s32(a, b) simde_vabdl_high_s32((a), (b))
#endif

/* vabdl_high_u8: applies simde_vabdl_u8 to the high halves of a and b
 * (as extracted by simde_vget_high_u8), yielding a simde_uint16x8_t.
 * The native intrinsic is used on A64 NEON targets. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vabdl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vabdl_high_u8(a, b);
  #else
    simde_uint8x8_t a_hi = simde_vget_high_u8(a);
    simde_uint8x8_t b_hi = simde_vget_high_u8(b);
    return simde_vabdl_u8(a_hi, b_hi);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabdl_high_u8
  #define vabdl_high_u8(a, b) simde_vabdl_high_u8((a), (b))
#endif

/* vabdl_high_u16: applies simde_vabdl_u16 to the high halves of a and b
 * (as extracted by simde_vget_high_u16), yielding a simde_uint32x4_t.
 * The native intrinsic is used on A64 NEON targets. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vabdl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vabdl_high_u16(a, b);
  #else
    simde_uint16x4_t a_hi = simde_vget_high_u16(a);
    simde_uint16x4_t b_hi = simde_vget_high_u16(b);
    return simde_vabdl_u16(a_hi, b_hi);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabdl_high_u16
  #define vabdl_high_u16(a, b) simde_vabdl_high_u16((a), (b))
#endif

/* vabdl_high_u32: applies simde_vabdl_u32 to the high halves of a and b
 * (as extracted by simde_vget_high_u32), yielding a simde_uint64x2_t.
 * The native intrinsic is used on A64 NEON targets. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vabdl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vabdl_high_u32(a, b);
  #else
    simde_uint32x2_t a_hi = simde_vget_high_u32(a);
    simde_uint32x2_t b_hi = simde_vget_high_u32(b);
    return simde_vabdl_u32(a_hi, b_hi);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  /* Route the native spelling to the portable implementation. */
  #undef vabdl_high_u32
  #define vabdl_high_u32(a, b) simde_vabdl_high_u32((a), (b))
#endif

SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_ARM_NEON_ABDL_HIGH_H) */
Loading