NEON: implement all intrinsics supported by architecture A64-remaining part (#1093)

* [NEON] Add 5 intrinsics (vdiv{h/q}_f{16/64}). * [NEON] Add 11 dup_lane series intrinsics. - 2 dup{q}_laneq_f16 - 9 dup{b,h}_lane{q}_{s/u}{8,16}, duph_laneq_f16 * [NEON] Add 8 intrinsics (veor3q{s/u}{8/16/32/64}). * [NEON] Add fmlal, fmlsl, maxnmv, minnmv, pmaxnm, pminnm. * [NEON] Add 12 fmlal series intrinsics. * [NEON] Add 12 fmlsl series intrinsics. * [NEON] Add 11 vmax series intrinsics. - 1 vmaxh_f16 - 3 vmaxnm{/h/q}_f16 - 5 vmaxnmv{q}_f{16/32/64} - 2 vmaxv{q}_f16 * [NEON] Add 11 vmin series intrinsics. - 1 vminh_f16 - 3 vminnm{/h/q}_f16 - 5 vminnmv{q}_f{16/32/64} - 2 vminv{q}_f16 * [NEON] Add 8 vpmax series intrinsics. - 1 vpmaxq_f16 - 7 vpmaxnm{/s/q/qd}_f{16/32/64} * [NEON] Add 9 vpmin series intrinsics. - 2 vpmin{q}_f16 - 7 vpminnm{/s/q/qd}_f{16/32/64} * [NEON] Add 8 intrinsic function families. mmlaq, mull_high_lane, mull_high_n, mulx, mulx_lane, mulx_n, qrdmlah, qmovun_high. * [NEON] Add 3 vmmlaq series intrinsics. * [NEON] Add 41 vmul-related intrinsics. - 8 mull_high_lane series intrinsics - 4 mull_high_n series intrinsics - 9 vmulx series intrinsics - 2 vmulx{q}_n_f16 series intrinsics - 18 vmulx_lane series intrinsics * [NEON] Add 1 vpaddq_f16 intrinsic. * [NEON] Add 3 vqmovun_high_s{16/32/64} intrinsics. * [NEON] Add 6 vqrdmlah series intrinsics. * [NEON] Add 11 series intrinsics. qrdmlah_lane, qrdmlsh, qrdmlsh_lane, qshrun_high_n, rnd32x, rnd32z, rnd64x, rnd64z, rnda, rndx, shll_high_n. * [NEON] Add 30 vqrdmlah, vqrdmlsh related intrinsics. - 12 vqrdmlah{h/s/q}_lane{q}_s{16/32} - 6 vqrdmlsh{h/s/q}_s{16/32} - 12 vqrdmlsh{h/s/q}_lane{q}_s{16/32} * [NEON] Add 2 vqrdmulhh_lane{q}_s16 intrinsics. * [NEON] Add 5 vqsh related intrinsics. - 1 vqshluh_n_s16 - 3 vqshrun_high_n_s{16/32/64} - 1 vqshrun_n_s16 * [NEON] Add 16 vrnd32x, vrnd32z, vrnd64x, vrnd64z related intrinsics. - 4 vrnd32x{q}_f{32/64} - 4 vrnd32z{q}_f{32/64} - 4 vrnd64x{q}_f{32/64} - 4 vrnd64z{q}_f{32/64} * [NEON] Add vrnd{/a/i/m/p/x} related intrinsics. 
- 3 vrnd{/h/q}_f16 - 3 vrndi{/h/q}_f16 - 3 vrndm{/h/q}_f16 - 3 vrndp{/h/q}_f16 - 7 vrnda{q}_f{16/32/64}, vrndah_f16 - 7 vrndx{q}_f{16/32/64}, vrndxh_f16 * [NEON] Add 6 vshll_high_n series intrinsics. * [NEON] Add 7 intrinsic series. cadd_rot270, cadd_rot90, shrn_high_n, subhn_high, sudot_lane, usdot, usdot_lane * [NEON] Add 2 vcmla{q}_f16 intrinsics * [NEON] Add 6 vshrn_high_n series intrinsics * [NEON] Add 6 vsubhn_high series intrinsics * [NEON] Add 10 vsudot_lane, vusdot, and vusdot_lane series intrinsics. - 4 vsudot{q}_lane{q}_s32 - 2 vusdot{q}_s32 - 4 vusdot{q}_lane{q}_s32 * [NEON] Add 10 vcadd{q}_rot{90/270}_f{16/32/64} intrinsics. * [NEON] Add 5 series intrinsics. cmla_lane, cmla_rot180_lane, cmla_rot270_lane, cmla_rot90_lane, recpx. * [NEON] Add 38 vcmla related intrinsics. - 8 vcmla{q}_lane{q}_f{16/32} - 2 vcmla{q}_rot90_f16 - 8 vcmla{q}_rot90_lane{q}_f{16/32} - 2 vcmla{q}_rot180_f16 - 8 vcmla{q}_rot180_lane{q}_f{16/32} - 2 vcmla{q}_rot270_f16 - 8 vcmla{q}_rot270_lane{q}_f{16/32} * [NEON] Add vrecpeh_f16 and vrecpsh_f16 intrinsics. * [NEON] Add 3 vrecpx{h,s,d}_f{16,32,64} intrinsics. * [NEON] Add 8 series intrinsics. __crc32, rax, sha1, sha256, sha512, sm3, sm4 * [NEON] Add 8 __crc series intrinsics. * [NEON] Add vrax1q_u64 intrinsic * [NEON] Add sha1, sha256, and sha512 series intrinsics * [NEON] Add sm3 and sm4 series intrinsics * [NEON] Include <arm_acle.h> for __crc32 intrinsics * [NEON] Use uint to simulate the poly type and implement it * [NEON] Add poly type related intrinsics * [NEON] Add ldr and str related intrinsics. Co-authored-by: Eric Yi-Yen Chung <[email protected]> Co-authored-by: Michael R. Crusoe <[email protected]>
simd-everywhere · Nov 16, 2023 · 018ba24 · 018ba24
1 parent 692a2e8
commit 018ba24
Show file tree

Hide file tree

Showing 283 changed files with 86,094 additions and 165 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -690,6 +690,13 @@ jobs:
         rm test/x86/svml.c
         echo "Due to the qemu versions 7.2 through 8.0 causing timeouts in four sets of test cases,"
         echo "the SVML tests have been temporarily disabled."
+    - name: Disable RND*X tests
+      run: |
+        sed -i "/rndx/d" meson.build
+        sed -i "/rnd..x/d" meson.build
+        rm test/arm/neon/rndx.c test/arm/neon/rnd32x.c test/arm/neon/rnd64x.c
+        echo "Due to the qemu versions 7.2 through 8.0 causing timeouts in four sets of test cases,"
+        echo "the RNDX, RND32X, and RND64X tests have been temporarily disabled."
     - name: Configure
       run: meson setup build --cross-file=docker/cross-files/loongarch64-gcc-13-ccache.cross
     - name: Build

diff --git a/meson.build b/meson.build
@@ -50,12 +50,12 @@ simde_neon_families = [
   'clz',
   'cmla',
   'cmla_lane',
+  'cmla_rot180',
   'cmla_rot180_lane',
+  'cmla_rot270',
   'cmla_rot270_lane',
-  'cmla_rot90_lane',
   'cmla_rot90',
-  'cmla_rot180',
-  'cmla_rot270',
+  'cmla_rot90_lane',
   'cnt',
   'cvt',
   'cvt_n',
@@ -64,6 +64,7 @@ simde_neon_families = [
   'cvtp',
   'combine',
   'copy_lane',
+  'crc32',
   'create',
   'div',
   'dot',
@@ -75,6 +76,8 @@ simde_neon_families = [
   'fma',
   'fma_lane',
   'fma_n',
+  'fmlal',
+  'fmlsl',
   'fms',
   'fms_lane',
   'fms_n',
@@ -103,9 +106,11 @@ simde_neon_families = [
   'ld4_lane',
   'max',
   'maxnm',
+  'maxnmv',
   'maxv',
   'min',
   'minnm',
+  'minnmv',
   'minv',
   'mla',
   'mla_lane',
@@ -125,7 +130,7 @@ simde_neon_families = [
   'mlsl_high_n',
   'mlsl_lane',
   'mlsl_n',
-  #'mmlaq',
+  'mmlaq',
   'movl',
   'movl_high',
   'movn',
@@ -135,8 +140,13 @@ simde_neon_families = [
   'mul_n',
   'mull',
   'mull_high',
+  'mull_high_lane',
+  'mull_high_n',
   'mull_lane',
   'mull_n',
+  'mulx',
+  'mulx_lane',
+  'mulx_n',
   'mvn',
   'neg',
   'orn',
@@ -145,7 +155,9 @@ simde_neon_families = [
   'padd',
   'paddl',
   'pmax',
+  'pmaxnm',
   'pmin',
+  'pminnm',
   'qadd',
   'qabs',
   'qdmlal',
@@ -169,6 +181,10 @@ simde_neon_families = [
   'qdmull_high_n',
   'qdmull_lane',
   'qdmull_n',
+  'qrdmlah',
+  'qrdmlah_lane',
+  'qrdmlsh',
+  'qrdmlsh_lane',
   'qrdmulh',
   'qrdmulh_lane',
   'qrdmulh_n',
@@ -180,31 +196,41 @@ simde_neon_families = [
   'qmovn',
   'qmovn_high',
   'qmovun',
+  'qmovun_high',
   'qneg',
   'qshl',
   'qshl_n',
   'qshlu_n',
   'qshrn_high_n',
   'qshrn_n',
+  'qshrun_high_n',
   'qshrun_n',
   'qsub',
   'qtbl',
   'qtbx',
   'raddhn',
   'raddhn_high',
+  'rax',
   'rbit',
   'recpe',
   'recps',
+  'recpx',
   'reinterpret',
   'rev16',
   'rev32',
   'rev64',
   'rhadd',
   'rnd',
+  'rnd32x',
+  'rnd32z',
+  'rnd64x',
+  'rnd64z',
+  'rnda',
   'rndi',
   'rndm',
   'rndn',
   'rndp',
+  'rndx',
   'rshl',
   'rshr_n',
   'rshrn_high_n',
@@ -215,12 +241,19 @@ simde_neon_families = [
   'rsubhn',
   'rsubhn_high',
   'set_lane',
+  'sha1',
+  'sha256',
+  'sha512',
   'shl',
   'shl_n',
+  'shll_high_n',
   'shll_n',
   'shr_n',
+  'shrn_high_n',
   'shrn_n',
   'sli_n',
+  'sm3',
+  'sm4',
   'sqadd',
   'sqrt',
   'sra_n',
@@ -241,17 +274,21 @@ simde_neon_families = [
   'st4_lane',
   'sub',
   'subhn',
+  'subhn_high',
   'subl',
   'subl_high',
   'subw',
   'subw_high',
+  'sudot_lane',
   'tbl',
   'tbx',
   'trn1',
   'trn2',
   'trn',
   'tst',
   'uqadd',
+  'usdot',
+  'usdot_lane',
   'uzp1',
   'uzp2',
   'uzp',

diff --git a/simde/arm/neon.h b/simde/arm/neon.h
@@ -71,12 +71,12 @@
 #include "neon/clz.h"
 #include "neon/cmla.h"
 #include "neon/cmla_lane.h"
+#include "neon/cmla_rot180.h"
 #include "neon/cmla_rot180_lane.h"
+#include "neon/cmla_rot270.h"
 #include "neon/cmla_rot270_lane.h"
-#include "neon/cmla_rot90_lane.h"
 #include "neon/cmla_rot90.h"
-#include "neon/cmla_rot180.h"
-#include "neon/cmla_rot270.h"
+#include "neon/cmla_rot90_lane.h"
 #include "neon/cnt.h"
 #include "neon/cvt.h"
 #include "neon/cvt_n.h"
@@ -85,6 +85,7 @@
 #include "neon/cvtp.h"
 #include "neon/combine.h"
 #include "neon/copy_lane.h"
+#include "neon/crc32.h"
 #include "neon/create.h"
 #include "neon/div.h"
 #include "neon/dot.h"
@@ -96,6 +97,8 @@
 #include "neon/fma.h"
 #include "neon/fma_lane.h"
 #include "neon/fma_n.h"
+#include "neon/fmlal.h"
+#include "neon/fmlsl.h"
 #include "neon/fms.h"
 #include "neon/fms_lane.h"
 #include "neon/fms_n.h"
@@ -124,9 +127,11 @@
 #include "neon/ld4_lane.h"
 #include "neon/max.h"
 #include "neon/maxnm.h"
+#include "neon/maxnmv.h"
 #include "neon/maxv.h"
 #include "neon/min.h"
 #include "neon/minnm.h"
+#include "neon/minnmv.h"
 #include "neon/minv.h"
 #include "neon/mla.h"
 #include "neon/mla_lane.h"
@@ -146,7 +151,7 @@
 #include "neon/mlsl_high_n.h"
 #include "neon/mlsl_lane.h"
 #include "neon/mlsl_n.h"
-//#include "neon/mmlaq.h"
+#include "neon/mmlaq.h"
 #include "neon/movl.h"
 #include "neon/movl_high.h"
 #include "neon/movn.h"
@@ -156,8 +161,13 @@
 #include "neon/mul_n.h"
 #include "neon/mull.h"
 #include "neon/mull_high.h"
+#include "neon/mull_high_lane.h"
+#include "neon/mull_high_n.h"
 #include "neon/mull_lane.h"
 #include "neon/mull_n.h"
+#include "neon/mulx.h"
+#include "neon/mulx_lane.h"
+#include "neon/mulx_n.h"
 #include "neon/mvn.h"
 #include "neon/neg.h"
 #include "neon/orn.h"
@@ -166,7 +176,9 @@
 #include "neon/padd.h"
 #include "neon/paddl.h"
 #include "neon/pmax.h"
+#include "neon/pmaxnm.h"
 #include "neon/pmin.h"
+#include "neon/pminnm.h"
 #include "neon/qabs.h"
 #include "neon/qadd.h"
 #include "neon/qdmlal.h"
@@ -190,6 +202,10 @@
 #include "neon/qdmull_high_n.h"
 #include "neon/qdmull_lane.h"
 #include "neon/qdmull_n.h"
+#include "neon/qrdmlah.h"
+#include "neon/qrdmlah_lane.h"
+#include "neon/qrdmlsh.h"
+#include "neon/qrdmlsh_lane.h"
 #include "neon/qrdmulh.h"
 #include "neon/qrdmulh_lane.h"
 #include "neon/qrdmulh_n.h"
@@ -199,33 +215,43 @@
 #include "neon/qrshrun_high_n.h"
 #include "neon/qrshrun_n.h"
 #include "neon/qmovn.h"
-#include "neon/qmovun.h"
 #include "neon/qmovn_high.h"
+#include "neon/qmovun.h"
+#include "neon/qmovun_high.h"
 #include "neon/qneg.h"
 #include "neon/qsub.h"
 #include "neon/qshl.h"
 #include "neon/qshl_n.h"
 #include "neon/qshlu_n.h"
 #include "neon/qshrn_high_n.h"
 #include "neon/qshrn_n.h"
+#include "neon/qshrun_high_n.h"
 #include "neon/qshrun_n.h"
 #include "neon/qtbl.h"
 #include "neon/qtbx.h"
 #include "neon/raddhn.h"
 #include "neon/raddhn_high.h"
+#include "neon/rax.h"
 #include "neon/rbit.h"
 #include "neon/recpe.h"
 #include "neon/recps.h"
+#include "neon/recpx.h"
 #include "neon/reinterpret.h"
 #include "neon/rev16.h"
 #include "neon/rev32.h"
 #include "neon/rev64.h"
 #include "neon/rhadd.h"
 #include "neon/rnd.h"
+#include "neon/rnd32x.h"
+#include "neon/rnd32z.h"
+#include "neon/rnd64x.h"
+#include "neon/rnd64z.h"
+#include "neon/rnda.h"
 #include "neon/rndm.h"
 #include "neon/rndi.h"
 #include "neon/rndn.h"
 #include "neon/rndp.h"
+#include "neon/rndx.h"
 #include "neon/rshl.h"
 #include "neon/rshr_n.h"
 #include "neon/rshrn_high_n.h"
@@ -236,12 +262,19 @@
 #include "neon/rsubhn.h"
 #include "neon/rsubhn_high.h"
 #include "neon/set_lane.h"
+#include "neon/sha1.h"
+#include "neon/sha256.h"
+#include "neon/sha512.h"
 #include "neon/shl.h"
 #include "neon/shl_n.h"
+#include "neon/shll_high_n.h"
 #include "neon/shll_n.h"
 #include "neon/shr_n.h"
+#include "neon/shrn_high_n.h"
 #include "neon/shrn_n.h"
 #include "neon/sli_n.h"
+#include "neon/sm3.h"
+#include "neon/sm4.h"
 #include "neon/sqadd.h"
 #include "neon/sqrt.h"
 #include "neon/sra_n.h"
@@ -262,17 +295,21 @@
 #include "neon/st4_lane.h"
 #include "neon/sub.h"
 #include "neon/subhn.h"
+#include "neon/subhn_high.h"
 #include "neon/subl.h"
 #include "neon/subl_high.h"
 #include "neon/subw.h"
 #include "neon/subw_high.h"
+#include "neon/sudot_lane.h"
 #include "neon/tbl.h"
 #include "neon/tbx.h"
 #include "neon/trn.h"
 #include "neon/trn1.h"
 #include "neon/trn2.h"
 #include "neon/tst.h"
 #include "neon/uqadd.h"
+#include "neon/usdot.h"
+#include "neon/usdot_lane.h"
 #include "neon/uzp.h"
 #include "neon/uzp1.h"
 #include "neon/uzp2.h"