Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NEON: more fp16 using intrinsics supported by architecture v7 #1075

Closed
wants to merge 44 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
8856f0e
[NEON] Add vabal_{s/u}{8/16/32}
yyctw Jun 21, 2023
f8c778a
[NEON] Add vabal_high_{s/u}{8/16/32}
yyctw Jun 21, 2023
ff2ec4d
[NEON] Add all vcale* intrinsics (9)
yyctw Jun 21, 2023
755cb1d
[NEON] Add all vcalt intrinsics (9)
yyctw Jun 21, 2023
65847b2
[NEON] Add vcreate_f16
yyctw Jun 21, 2023
c51141b
[NEON] Add vreinterpret_u64_f16
yyctw Jun 21, 2023
c3372b2
[NEON] Add vcvth_f16_s16 and vcvth_f16_u16
yyctw Jul 7, 2023
1f0b8ff
[NEON] Add vduph_lane_f16, vdup_lane_f16, and vdupq_lane_f16
yyctw Jul 7, 2023
e655a5c
[NEON] Add vext_f16
yyctw Jul 7, 2023
a6edcd7
[NEON] Add 16 vcvt{q}_n_* intrinsics
yyctw Jul 7, 2023
71ad453
[Fix] Correct function input parameters
yyctw Jul 10, 2023
dfe46c2
[NEON] Add 6 vcvtn_{s/u}{16/32/64}_f{*} intrinsics
yyctw Jul 10, 2023
978095e
[Fix] Correct vdup_lane_f16 and vdupq_lane_f16.
yyctw Jul 10, 2023
0c6df69
[Fix] Correct function input parameters.
yyctw Jul 10, 2023
3ea516f
[NEON] Add 24 vcvt{q}_n_* intrinsics
yyctw Jul 11, 2023
b091361
[NEON] Add all vcvtn* intrinsics
yyctw Jul 17, 2023
5e5d10b
[NEON] Add vfmah_f16 and vfma_f16
yyctw Jul 18, 2023
7778f3d
[NEON] Add vfma_n_f16 and vfmaq_n_f16
yyctw Jul 18, 2023
c45b405
[NEON] Add vmulh_f16
yyctw Jul 18, 2023
2ae7405
[NEON] Add fma_lane related intrinsics.
yyctw Jul 18, 2023
32833b1
[NEON] Add 5 vmul* related intrinsics
yyctw Jul 18, 2023
90c6072
[NEON] Add neg related intrinsics.
yyctw Jul 18, 2023
dfc8c1f
[NEON] Add all fms, fms_n, and fms_lane intrinsics
yyctw Jul 19, 2023
876015f
[NEON] Add types float16x{4/8}x{2/3/4}
yyctw Jul 19, 2023
899d0ec
[NEON] Add 9 vld1 related intrinsics
yyctw Jul 19, 2023
82a66b8
[Fix] Modified wrong rounding implementation.
yyctw Jul 20, 2023
0f04ab2
[Fix] Fix wrong intrinsic alias names.
yyctw Jul 20, 2023
a0be5a2
[Refactor] Remove redundant functions.
yyctw Jul 25, 2023
077d5ff
[NEON] Add 45 ld2 related intrinsics
yyctw Jul 26, 2023
19c5191
[NEON] Add ld3_dup, ld3_lane, and ld4_dup
yyctw Jul 28, 2023
e10760e
[NEON] Add vld3_f16 and vld4_f16.
yyctw Jul 28, 2023
e10263b
[NEON] Add vld{3/4}_{dup/lane} series intrinsics
yyctw Jul 28, 2023
5bf20e9
[NEON] Add mla_{high}_lane series intrinsics
yyctw Aug 1, 2023
03df636
[NEON] Add qdmlal_{high}_{lane} series intrinsics.
yyctw Aug 1, 2023
b1b4a1e
[NEON] Add qdmlal_lane and qdmlal_n series intrinsics
yyctw Aug 1, 2023
def93bf
[NEON] Add mls_lane and mlsl_high_lane series intrinsics
yyctw Aug 2, 2023
72fbd7d
[NEON] Add 22 qdmlsl series intrinsics
yyctw Aug 2, 2023
670aafd
[NEON] Add 10 qdmull_* series intrinsics
yyctw Aug 2, 2023
e54669b
[NEON] Add 3 qdmulh series intrinsics
yyctw Aug 2, 2023
999c394
[Fix] Fix wrong function name.
yyctw Aug 2, 2023
79cda85
[Fix] Correct the wrong alias function name.
yyctw Aug 4, 2023
cd400ad
[NEON] Add qdmullh_lane{q}_s{16/32} related intrinsics
yyctw Aug 4, 2023
36e40d3
[NEON] Add qdmull_n and qdmull_high_lane series intrinsics
yyctw Aug 4, 2023
675c697
[Fix] Add conditions for fp16 intrinsics
yyctw Oct 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/cross-files/i686-gcc-11-qemu.cross
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ exe_wrapper = ['qemu-i386-static', '-L', '/usr/i686-linux-gnu']

[properties]
c_args = ['-Wextra', '-Werror', '-O2']
cpp_args = ['-Wextra', '-Werror', '-O2']
cpp_args = ['-Wextra', '-Werror', '-O2', '-ffloat-store']
needs_exe_wrapper = true

[host_machine]
Expand Down
34 changes: 34 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ cxx = meson.get_compiler('cpp')

simde_neon_families = [
'aba',
'abal',
'abal_high',
'abd',
'abdl',
'abs',
Expand All @@ -27,6 +29,8 @@ simde_neon_families = [
'bsl',
'cage',
'cagt',
'cale',
'calt',
'ceq',
'ceqz',
'cge',
Expand All @@ -45,6 +49,7 @@ simde_neon_families = [
'cmla_rot270',
'cnt',
'cvt',
'cvt_n',
'cvtn',
'combine',
'create',
Expand All @@ -58,6 +63,9 @@ simde_neon_families = [
'fma',
'fma_lane',
'fma_n',
'fms',
'fms_lane',
'fms_n',
'get_high',
'get_lane',
'get_low',
Expand All @@ -73,8 +81,13 @@ simde_neon_families = [
'ld1q_x4',
'ld1',
'ld2',
'ld2_dup',
'ld2_lane',
'ld3',
'ld3_dup',
'ld3_lane',
'ld4',
'ld4_dup',
'ld4_lane',
'max',
'maxnm',
Expand All @@ -87,16 +100,20 @@ simde_neon_families = [
'mla_n',
'mlal',
'mlal_high',
'mlal_high_lane',
'mlal_high_n',
'mlal_lane',
'mlal_n',
'mls',
'mls_lane',
'mls_n',
'mlsl',
'mlsl_high',
'mlsl_high_lane',
'mlsl_high_n',
'mlsl_lane',
'mlsl_n',
#'mmlaq',
'movl',
'movl_high',
'movn',
Expand All @@ -119,10 +136,27 @@ simde_neon_families = [
'pmin',
'qadd',
'qabs',
'qdmlal',
'qdmlal_high',
'qdmlal_high_lane',
'qdmlal_high_n',
'qdmlal_lane',
'qdmlal_n',
'qdmlsl',
'qdmlsl_high',
'qdmlsl_high_lane',
'qdmlsl_high_n',
'qdmlsl_lane',
'qdmlsl_n',
'qdmulh',
'qdmulh_lane',
'qdmulh_n',
'qdmull',
'qdmull_high',
'qdmull_high_lane',
'qdmull_high_n',
'qdmull_lane',
'qdmull_n',
'qrdmulh',
'qrdmulh_lane',
'qrdmulh_n',
Expand Down
34 changes: 34 additions & 0 deletions simde/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include "neon/types.h"

#include "neon/aba.h"
#include "neon/abal.h"
#include "neon/abal_high.h"
#include "neon/abd.h"
#include "neon/abdl.h"
#include "neon/abs.h"
Expand All @@ -48,6 +50,8 @@
#include "neon/bsl.h"
#include "neon/cage.h"
#include "neon/cagt.h"
#include "neon/cale.h"
#include "neon/calt.h"
#include "neon/ceq.h"
#include "neon/ceqz.h"
#include "neon/cge.h"
Expand All @@ -66,6 +70,7 @@
#include "neon/cmla_rot270.h"
#include "neon/cnt.h"
#include "neon/cvt.h"
#include "neon/cvt_n.h"
#include "neon/cvtn.h"
#include "neon/combine.h"
#include "neon/create.h"
Expand All @@ -79,6 +84,9 @@
#include "neon/fma.h"
#include "neon/fma_lane.h"
#include "neon/fma_n.h"
#include "neon/fms.h"
#include "neon/fms_lane.h"
#include "neon/fms_n.h"
#include "neon/get_high.h"
#include "neon/get_lane.h"
#include "neon/get_low.h"
Expand All @@ -94,8 +102,13 @@
#include "neon/ld1q_x3.h"
#include "neon/ld1q_x4.h"
#include "neon/ld2.h"
#include "neon/ld2_dup.h"
#include "neon/ld2_lane.h"
#include "neon/ld3.h"
#include "neon/ld3_dup.h"
#include "neon/ld3_lane.h"
#include "neon/ld4.h"
#include "neon/ld4_dup.h"
#include "neon/ld4_lane.h"
#include "neon/max.h"
#include "neon/maxnm.h"
Expand All @@ -108,16 +121,20 @@
#include "neon/mla_n.h"
#include "neon/mlal.h"
#include "neon/mlal_high.h"
#include "neon/mlal_high_lane.h"
#include "neon/mlal_high_n.h"
#include "neon/mlal_lane.h"
#include "neon/mlal_n.h"
#include "neon/mls.h"
#include "neon/mls_lane.h"
#include "neon/mls_n.h"
#include "neon/mlsl.h"
#include "neon/mlsl_high.h"
#include "neon/mlsl_high_lane.h"
#include "neon/mlsl_high_n.h"
#include "neon/mlsl_lane.h"
#include "neon/mlsl_n.h"
//#include "neon/mmlaq.h"
#include "neon/movl.h"
#include "neon/movl_high.h"
#include "neon/movn.h"
Expand All @@ -140,10 +157,27 @@
#include "neon/pmin.h"
#include "neon/qabs.h"
#include "neon/qadd.h"
#include "neon/qdmlal.h"
#include "neon/qdmlal_high.h"
#include "neon/qdmlal_high_lane.h"
#include "neon/qdmlal_high_n.h"
#include "neon/qdmlal_lane.h"
#include "neon/qdmlal_n.h"
#include "neon/qdmlsl.h"
#include "neon/qdmlsl_high.h"
#include "neon/qdmlsl_high_lane.h"
#include "neon/qdmlsl_high_n.h"
#include "neon/qdmlsl_lane.h"
#include "neon/qdmlsl_n.h"
#include "neon/qdmulh.h"
#include "neon/qdmulh_lane.h"
#include "neon/qdmulh_n.h"
#include "neon/qdmull.h"
#include "neon/qdmull_high.h"
#include "neon/qdmull_high_lane.h"
#include "neon/qdmull_high_n.h"
#include "neon/qdmull_lane.h"
#include "neon/qdmull_n.h"
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
Expand Down
125 changes: 125 additions & 0 deletions simde/arm/neon/abal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_ABAL_H)
#define SIMDE_ARM_NEON_ABAL_H

#include "abdl.h"
#include "add.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

/* simde_vabal_s8: accumulate into the 16-bit lanes of `a` the result of
 * simde_vabdl_s8(b, c) (vabdl = absolute difference long).
 *
 * a: accumulator vector (int16x8)
 * b, c: source vectors (int8x8) passed to the vabdl step
 * Returns the int16x8 sum. Forwards to the native vabal_s8 intrinsic when
 * SIMDE_ARM_NEON_A32V7_NATIVE is defined. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s8(a, b, c);
  #else
    /* Portable path: widen-and-difference first, then add the accumulator. */
    simde_int16x8_t abs_diff = simde_vabdl_s8(b, c);
    return simde_vaddq_s16(abs_diff, a);
  #endif
}
/* With native aliases enabled, the plain vabal_s8 name maps to the SIMDe version. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_s8
  #define vabal_s8(a, b, c) simde_vabal_s8((a), (b), (c))
#endif

/* simde_vabal_s16: accumulate into the 32-bit lanes of `a` the result of
 * simde_vabdl_s16(b, c) (vabdl = absolute difference long).
 *
 * a: accumulator vector (int32x4)
 * b, c: source vectors (int16x4) passed to the vabdl step
 * Returns the int32x4 sum. Forwards to the native vabal_s16 intrinsic when
 * SIMDE_ARM_NEON_A32V7_NATIVE is defined. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s16(a, b, c);
  #else
    /* Portable path: widen-and-difference first, then add the accumulator. */
    simde_int32x4_t abs_diff = simde_vabdl_s16(b, c);
    return simde_vaddq_s32(abs_diff, a);
  #endif
}
/* With native aliases enabled, the plain vabal_s16 name maps to the SIMDe version. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_s16
  #define vabal_s16(a, b, c) simde_vabal_s16((a), (b), (c))
#endif

/* simde_vabal_s32: accumulate into the 64-bit lanes of `a` the result of
 * simde_vabdl_s32(b, c) (vabdl = absolute difference long).
 *
 * a: accumulator vector (int64x2)
 * b, c: source vectors (int32x2) passed to the vabdl step
 * Returns the int64x2 sum. Forwards to the native vabal_s32 intrinsic when
 * SIMDE_ARM_NEON_A32V7_NATIVE is defined. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s32(a, b, c);
  #else
    /* Portable path: widen-and-difference first, then add the accumulator. */
    simde_int64x2_t abs_diff = simde_vabdl_s32(b, c);
    return simde_vaddq_s64(abs_diff, a);
  #endif
}
/* With native aliases enabled, the plain vabal_s32 name maps to the SIMDe version. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_s32
  #define vabal_s32(a, b, c) simde_vabal_s32((a), (b), (c))
#endif

/* simde_vabal_u8: accumulate into the 16-bit lanes of `a` the result of
 * simde_vabdl_u8(b, c) (vabdl = absolute difference long).
 *
 * a: accumulator vector (uint16x8)
 * b, c: source vectors (uint8x8) passed to the vabdl step
 * Returns the uint16x8 sum. Forwards to the native vabal_u8 intrinsic when
 * SIMDE_ARM_NEON_A32V7_NATIVE is defined. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u8(a, b, c);
  #else
    /* Portable path: widen-and-difference first, then add the accumulator. */
    simde_uint16x8_t abs_diff = simde_vabdl_u8(b, c);
    return simde_vaddq_u16(abs_diff, a);
  #endif
}
/* With native aliases enabled, the plain vabal_u8 name maps to the SIMDe version. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_u8
  #define vabal_u8(a, b, c) simde_vabal_u8((a), (b), (c))
#endif

/* simde_vabal_u16: accumulate into the 32-bit lanes of `a` the result of
 * simde_vabdl_u16(b, c) (vabdl = absolute difference long).
 *
 * a: accumulator vector (uint32x4)
 * b, c: source vectors (uint16x4) passed to the vabdl step
 * Returns the uint32x4 sum. Forwards to the native vabal_u16 intrinsic when
 * SIMDE_ARM_NEON_A32V7_NATIVE is defined. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u16(a, b, c);
  #else
    /* Portable path: widen-and-difference first, then add the accumulator. */
    simde_uint32x4_t abs_diff = simde_vabdl_u16(b, c);
    return simde_vaddq_u32(abs_diff, a);
  #endif
}
/* With native aliases enabled, the plain vabal_u16 name maps to the SIMDe version. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_u16
  #define vabal_u16(a, b, c) simde_vabal_u16((a), (b), (c))
#endif

/* simde_vabal_u32: accumulate into the 64-bit lanes of `a` the result of
 * simde_vabdl_u32(b, c) (vabdl = absolute difference long).
 *
 * a: accumulator vector (uint64x2)
 * b, c: source vectors (uint32x2) passed to the vabdl step
 * Returns the uint64x2 sum. Forwards to the native vabal_u32 intrinsic when
 * SIMDE_ARM_NEON_A32V7_NATIVE is defined. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u32(a, b, c);
  #else
    /* Portable path: widen-and-difference first, then add the accumulator. */
    simde_uint64x2_t abs_diff = simde_vabdl_u32(b, c);
    return simde_vaddq_u64(abs_diff, a);
  #endif
}
/* With native aliases enabled, the plain vabal_u32 name maps to the SIMDe version. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_u32
  #define vabal_u32(a, b, c) simde_vabal_u32((a), (b), (c))
#endif


SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_ARM_NEON_ABAL_H) */
Loading