Skip to content

Commit

Permalink
NEON: implement all intrinsics supported by architecture A64-remainin…
Browse files Browse the repository at this point in the history
…g part (#1093)

* [NEON] Add 5 intrinsics (vdiv{h/q}_f{16/64}).
* [NEON] Add 11 dup_lane series intrinsics.
- 2 dup{q}_laneq_f16
- 9 dup{b,h}_lane{q}_{s/u}{8,16}, duph_laneq_f16
* [NEON] Add 8 intrinsics (veor3q{s/u}{8/16/32/64}).
* [NEON] Add fmlal, fmlsl, maxnmv, minnmv, pmaxnm, pminnm.
* [NEON] Add 12 fmlal series intrinsics.
* [NEON] Add 12 fmlsl series intrinsics.
* [NEON] Add 11 vmax series intrinsics.
- 1 vmaxh_f16
- 3 vmaxnm{/h/q}_f16
- 5 vmaxnmv{q}_f{16/32/64}
- 2 vmaxv{q}_f16
* [NEON] Add 11 vmin series intrinsics.
- 1 vminh_f16
- 3 vminnm{/h/q}_f16
- 5 vminnmv{q}_f{16/32/64}
- 2 vminv{q}_f16
* [NEON] Add 8 vpmax series intrinsics.
- 1 vpmaxq_f16
- 7 vpmaxnm{/s/q/qd}_f{16/32/64}
* [NEON] Add 9 vpmin series intrinsics.
- 2 vpmin{q}_f16
- 7 vpminnm{/s/q/qd}_f{16/32/64}
* [NEON] Add 8 intrinsic function families.
mmlaq, mull_high_lane, mull_high_n, mulx,
mulx_lane, mulx_n, qrdmlah, qmovun_high.
* [NEON] Add 3 vmmlaq series intrinsics.
* [NEON] Add 41 vmul-related intrinsics.
- 8 mull_high_lane series intrinsics
- 4 mull_high_n series intrinsics
- 9 vmulx series intrinsics
- 2 vmulx{q}_n_f16 series intrinsics
- 18 vmulx_lane series intrinsics
* [NEON] Add 1 vpaddq_f16 intrinsic.
* [NEON] Add 3 vqmovun_high_s{16/32/64} intrinsics.
* [NEON] Add 6 vqrdmlah series intrinsics.
* [NEON] Add 11 series intrinsics.
qrdmlah_lane, qrdmlsh, qrdmlsh_lane, qshrun_high_n,
rnd32x, rnd32z, rnd64x, rnd64z, rnda, rndx, shll_high_n.
* [NEON] Add 30 vqrdmlah, vqrdmlsh related intrinsics.
- 12 vqrdmlah{h/s/q}_lane{q}_s{16/32}
- 6 vqrdmlsh{h/s/q}_s{16/32}
- 12 vqrdmlsh{h/s/q}_lane{q}_s{16/32}
* [NEON] Add 2 vqrdmulhh_lane{q}_s16 intrinsics.
* [NEON] Add 5 vqsh related intrinsics.
- 1 vqshluh_n_s16
- 3 vqshrun_high_n_s{16/32/64}
- 1 vqshrun_n_s16
* [NEON] Add 16 vrnd32x, vrnd32z, vrnd64x, vrnd64z related intrinsics.
- 4 vrnd32x{q}_f{32/64}
- 4 vrnd32z{q}_f{32/64}
- 4 vrnd64x{q}_f{32/64}
- 4 vrnd64z{q}_f{32/64}
* [NEON] Add vrnd{/a/i/m/p/x} related intrinsics.
- 3 vrnd{/h/q}_f16
- 3 vrndi{/h/q}_f16
- 3 vrndm{/h/q}_f16
- 3 vrndp{/h/q}_f16
- 7 vrnda{q}_f{16/32/64}, vrndah_f16
- 7 vrndx{q}_f{16/32/64}, vrndxh_f16
* [NEON] Add 6 vshll_high_n series intrinsics.
* [NEON] Add 7 intrinsic series.
cadd_rot270, cadd_rot90, shrn_high_n, subhn_high,
sudot_lane, usdot, usdot_lane
* [NEON] Add 2 vcmla{q}_f16 intrinsics
* [NEON] Add 6 vshrn_high_n series intrinsics
* [NEON] Add 6 vsubhn_high series intrinsics
* [NEON] Add 10 vsudot_lane, vusdot, and vusdot_lane series intrinsics.
- 4 vsudot{q}_lane{q}_s32
- 2 vusdot{q}_s32
- 4 vusdot{q}_lane{q}_s32
* [NEON] Add 10 vcadd{q}_rot{90/270}_f{16/32/64} intrinsics.
* [NEON] Add 5 series intrinsics.
cmla_lane, cmla_rot180_lane, cmla_rot270_lane, cmla_rot90_lane, recpx.
* [NEON] Add 38 vcmla related intrinsics.
- 8 vcmla{q}_lane{q}_f{16/32}
- 2 vcmla{q}_rot90_f16
- 8 vcmla{q}_rot90_lane{q}_f{16/32}
- 2 vcmla{q}_rot180_f16
- 8 vcmla{q}_rot180_lane{q}_f{16/32}
- 2 vcmla{q}_rot270_f16
- 8 vcmla{q}_rot270_lane{q}_f{16/32}
* [NEON] Add vrecpeh_f16 and vrecpsh_f16 intrinsics.
* [NEON] Add 3 vrecpx{h,s,d}_f{16,32,64} intrinsics.
* [NEON] Add 8 series intrinsics.
__crc32, rax, sha1, sha256, sha512, sm3, sm4
* [NEON] Add 8 __crc series intrinsics.
* [NEON] Add vrax1q_u64 intrinsic
* [NEON] Add sha1, sha256, and sha512 series intrinsics
* [NEON] Add sm3 and sm4 series intrinsics
* [NEON] Include <arm_acle.h> for __crc32 intrinsics
* [NEON] Use uint to simulate the poly type and implement it
* [NEON] Add poly type related intrinsics
* [NEON] Add ldr and str related intrinsics.

Co-authored-by: Eric Yi-Yen Chung <[email protected]>
Co-authored-by: Michael R. Crusoe <[email protected]>
  • Loading branch information
3 people authored Nov 16, 2023
1 parent 692a2e8 commit 018ba24
Show file tree
Hide file tree
Showing 283 changed files with 86,094 additions and 165 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,13 @@ jobs:
rm test/x86/svml.c
echo "Due to the qemu versions 7.2 through 8.0 causing timeouts in four sets of test cases,"
echo "the SVML tests have been temporarily disabled."
- name: Disable RND*X tests
run: |
sed -i "/rndx/d" meson.build
sed -i "/rnd..x/d" meson.build
rm test/arm/neon/rndx.c test/arm/neon/rnd32x.c test/arm/neon/rnd64x.c
echo "Due to the qemu versions 7.2 through 8.0 causing timeouts in four sets of test cases,"
echo "the RNDX, RND32X, and RND64X tests have been temporarily disabled."
- name: Configure
run: meson setup build --cross-file=docker/cross-files/loongarch64-gcc-13-ccache.cross
- name: Build
Expand Down
45 changes: 41 additions & 4 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ simde_neon_families = [
'clz',
'cmla',
'cmla_lane',
'cmla_rot180',
'cmla_rot180_lane',
'cmla_rot270',
'cmla_rot270_lane',
'cmla_rot90_lane',
'cmla_rot90',
'cmla_rot180',
'cmla_rot270',
'cmla_rot90_lane',
'cnt',
'cvt',
'cvt_n',
Expand All @@ -64,6 +64,7 @@ simde_neon_families = [
'cvtp',
'combine',
'copy_lane',
'crc32',
'create',
'div',
'dot',
Expand All @@ -75,6 +76,8 @@ simde_neon_families = [
'fma',
'fma_lane',
'fma_n',
'fmlal',
'fmlsl',
'fms',
'fms_lane',
'fms_n',
Expand Down Expand Up @@ -103,9 +106,11 @@ simde_neon_families = [
'ld4_lane',
'max',
'maxnm',
'maxnmv',
'maxv',
'min',
'minnm',
'minnmv',
'minv',
'mla',
'mla_lane',
Expand All @@ -125,7 +130,7 @@ simde_neon_families = [
'mlsl_high_n',
'mlsl_lane',
'mlsl_n',
#'mmlaq',
'mmlaq',
'movl',
'movl_high',
'movn',
Expand All @@ -135,8 +140,13 @@ simde_neon_families = [
'mul_n',
'mull',
'mull_high',
'mull_high_lane',
'mull_high_n',
'mull_lane',
'mull_n',
'mulx',
'mulx_lane',
'mulx_n',
'mvn',
'neg',
'orn',
Expand All @@ -145,7 +155,9 @@ simde_neon_families = [
'padd',
'paddl',
'pmax',
'pmaxnm',
'pmin',
'pminnm',
'qadd',
'qabs',
'qdmlal',
Expand All @@ -169,6 +181,10 @@ simde_neon_families = [
'qdmull_high_n',
'qdmull_lane',
'qdmull_n',
'qrdmlah',
'qrdmlah_lane',
'qrdmlsh',
'qrdmlsh_lane',
'qrdmulh',
'qrdmulh_lane',
'qrdmulh_n',
Expand All @@ -180,31 +196,41 @@ simde_neon_families = [
'qmovn',
'qmovn_high',
'qmovun',
'qmovun_high',
'qneg',
'qshl',
'qshl_n',
'qshlu_n',
'qshrn_high_n',
'qshrn_n',
'qshrun_high_n',
'qshrun_n',
'qsub',
'qtbl',
'qtbx',
'raddhn',
'raddhn_high',
'rax',
'rbit',
'recpe',
'recps',
'recpx',
'reinterpret',
'rev16',
'rev32',
'rev64',
'rhadd',
'rnd',
'rnd32x',
'rnd32z',
'rnd64x',
'rnd64z',
'rnda',
'rndi',
'rndm',
'rndn',
'rndp',
'rndx',
'rshl',
'rshr_n',
'rshrn_high_n',
Expand All @@ -215,12 +241,19 @@ simde_neon_families = [
'rsubhn',
'rsubhn_high',
'set_lane',
'sha1',
'sha256',
'sha512',
'shl',
'shl_n',
'shll_high_n',
'shll_n',
'shr_n',
'shrn_high_n',
'shrn_n',
'sli_n',
'sm3',
'sm4',
'sqadd',
'sqrt',
'sra_n',
Expand All @@ -241,17 +274,21 @@ simde_neon_families = [
'st4_lane',
'sub',
'subhn',
'subhn_high',
'subl',
'subl_high',
'subw',
'subw_high',
'sudot_lane',
'tbl',
'tbx',
'trn1',
'trn2',
'trn',
'tst',
'uqadd',
'usdot',
'usdot_lane',
'uzp1',
'uzp2',
'uzp',
Expand Down
47 changes: 42 additions & 5 deletions simde/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,12 @@
#include "neon/clz.h"
#include "neon/cmla.h"
#include "neon/cmla_lane.h"
#include "neon/cmla_rot180.h"
#include "neon/cmla_rot180_lane.h"
#include "neon/cmla_rot270.h"
#include "neon/cmla_rot270_lane.h"
#include "neon/cmla_rot90_lane.h"
#include "neon/cmla_rot90.h"
#include "neon/cmla_rot180.h"
#include "neon/cmla_rot270.h"
#include "neon/cmla_rot90_lane.h"
#include "neon/cnt.h"
#include "neon/cvt.h"
#include "neon/cvt_n.h"
Expand All @@ -85,6 +85,7 @@
#include "neon/cvtp.h"
#include "neon/combine.h"
#include "neon/copy_lane.h"
#include "neon/crc32.h"
#include "neon/create.h"
#include "neon/div.h"
#include "neon/dot.h"
Expand All @@ -96,6 +97,8 @@
#include "neon/fma.h"
#include "neon/fma_lane.h"
#include "neon/fma_n.h"
#include "neon/fmlal.h"
#include "neon/fmlsl.h"
#include "neon/fms.h"
#include "neon/fms_lane.h"
#include "neon/fms_n.h"
Expand Down Expand Up @@ -124,9 +127,11 @@
#include "neon/ld4_lane.h"
#include "neon/max.h"
#include "neon/maxnm.h"
#include "neon/maxnmv.h"
#include "neon/maxv.h"
#include "neon/min.h"
#include "neon/minnm.h"
#include "neon/minnmv.h"
#include "neon/minv.h"
#include "neon/mla.h"
#include "neon/mla_lane.h"
Expand All @@ -146,7 +151,7 @@
#include "neon/mlsl_high_n.h"
#include "neon/mlsl_lane.h"
#include "neon/mlsl_n.h"
//#include "neon/mmlaq.h"
#include "neon/mmlaq.h"
#include "neon/movl.h"
#include "neon/movl_high.h"
#include "neon/movn.h"
Expand All @@ -156,8 +161,13 @@
#include "neon/mul_n.h"
#include "neon/mull.h"
#include "neon/mull_high.h"
#include "neon/mull_high_lane.h"
#include "neon/mull_high_n.h"
#include "neon/mull_lane.h"
#include "neon/mull_n.h"
#include "neon/mulx.h"
#include "neon/mulx_lane.h"
#include "neon/mulx_n.h"
#include "neon/mvn.h"
#include "neon/neg.h"
#include "neon/orn.h"
Expand All @@ -166,7 +176,9 @@
#include "neon/padd.h"
#include "neon/paddl.h"
#include "neon/pmax.h"
#include "neon/pmaxnm.h"
#include "neon/pmin.h"
#include "neon/pminnm.h"
#include "neon/qabs.h"
#include "neon/qadd.h"
#include "neon/qdmlal.h"
Expand All @@ -190,6 +202,10 @@
#include "neon/qdmull_high_n.h"
#include "neon/qdmull_lane.h"
#include "neon/qdmull_n.h"
#include "neon/qrdmlah.h"
#include "neon/qrdmlah_lane.h"
#include "neon/qrdmlsh.h"
#include "neon/qrdmlsh_lane.h"
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
Expand All @@ -199,33 +215,43 @@
#include "neon/qrshrun_high_n.h"
#include "neon/qrshrun_n.h"
#include "neon/qmovn.h"
#include "neon/qmovun.h"
#include "neon/qmovn_high.h"
#include "neon/qmovun.h"
#include "neon/qmovun_high.h"
#include "neon/qneg.h"
#include "neon/qsub.h"
#include "neon/qshl.h"
#include "neon/qshl_n.h"
#include "neon/qshlu_n.h"
#include "neon/qshrn_high_n.h"
#include "neon/qshrn_n.h"
#include "neon/qshrun_high_n.h"
#include "neon/qshrun_n.h"
#include "neon/qtbl.h"
#include "neon/qtbx.h"
#include "neon/raddhn.h"
#include "neon/raddhn_high.h"
#include "neon/rax.h"
#include "neon/rbit.h"
#include "neon/recpe.h"
#include "neon/recps.h"
#include "neon/recpx.h"
#include "neon/reinterpret.h"
#include "neon/rev16.h"
#include "neon/rev32.h"
#include "neon/rev64.h"
#include "neon/rhadd.h"
#include "neon/rnd.h"
#include "neon/rnd32x.h"
#include "neon/rnd32z.h"
#include "neon/rnd64x.h"
#include "neon/rnd64z.h"
#include "neon/rnda.h"
#include "neon/rndm.h"
#include "neon/rndi.h"
#include "neon/rndn.h"
#include "neon/rndp.h"
#include "neon/rndx.h"
#include "neon/rshl.h"
#include "neon/rshr_n.h"
#include "neon/rshrn_high_n.h"
Expand All @@ -236,12 +262,19 @@
#include "neon/rsubhn.h"
#include "neon/rsubhn_high.h"
#include "neon/set_lane.h"
#include "neon/sha1.h"
#include "neon/sha256.h"
#include "neon/sha512.h"
#include "neon/shl.h"
#include "neon/shl_n.h"
#include "neon/shll_high_n.h"
#include "neon/shll_n.h"
#include "neon/shr_n.h"
#include "neon/shrn_high_n.h"
#include "neon/shrn_n.h"
#include "neon/sli_n.h"
#include "neon/sm3.h"
#include "neon/sm4.h"
#include "neon/sqadd.h"
#include "neon/sqrt.h"
#include "neon/sra_n.h"
Expand All @@ -262,17 +295,21 @@
#include "neon/st4_lane.h"
#include "neon/sub.h"
#include "neon/subhn.h"
#include "neon/subhn_high.h"
#include "neon/subl.h"
#include "neon/subl_high.h"
#include "neon/subw.h"
#include "neon/subw_high.h"
#include "neon/sudot_lane.h"
#include "neon/tbl.h"
#include "neon/tbx.h"
#include "neon/trn.h"
#include "neon/trn1.h"
#include "neon/trn2.h"
#include "neon/tst.h"
#include "neon/uqadd.h"
#include "neon/usdot.h"
#include "neon/usdot_lane.h"
#include "neon/uzp.h"
#include "neon/uzp1.h"
#include "neon/uzp2.h"
Expand Down
Loading

0 comments on commit 018ba24

Please sign in to comment.