Skip to content

Commit

Permalink
NEON: more fp16 using intrinsics supported by architecture v7 (skip v…
Browse files Browse the repository at this point in the history
…ersion) (#1081)

* [NEON] Add vabal_{s/u}{8/16/32}

* [NEON] Add vabal_high_{s/u}{8/16/32}

* [NEON] Add all vcale* intrinsics (9)

* [NEON] Add all vcalt intrinsics (9)

* [NEON] Add vcreate_f16

* [NEON] Add vreinterpret_u64_f16

* [NEON] Add vcvth_f16_s16 and vcvth_f16_u16

* [NEON] Add vduph_lane_f16, vdup_lane_f16, and vdupq_lane_f16

* [NEON] Add vext_f16

* [NEON] Add 16 vcvt{q}_n_* intrinsics

* [Fix] Correct function input parameters

* [NEON] Add 6 vcvtn_{s/u}{16/32/64}_f{*} intrinsics

* [Fix] Correct vdup_lane_f16 and vdupq_lane_f16.

* [Fix] Correct function input parameters.

* [NEON] Add 24 vcvt{q}_n_* intrinsics

* [NEON] Add all vcvtn* intrinsics

* [NEON] Add vfmah_f16 and vfma_f16

* [NEON] Add vfma_n_f16 and vfmaq_n_f16

* [NEON] Add vmulh_f16

* [NEON] Add fma_lane related intrinsics.

* [NEON] Add 5 vmul* related intrinsics
vmulh_lane_f16, vmulh_laneq_f16, vmul_lane_f16,
vmul_laneq_f16, vmulq_laneq_f16.

* [NEON] Add neg related intrinsics.

* [NEON] Add all fms, fms_n, and fms_lane intrinsics

* [NEON] Add types float16x{4/8}x{2/3/4}

* [NEON] Add 9 vld1 related intrinsics

* [Fix] Modified wrong rounding implementation.
Modified wrong implementation "Ties to Away" to "rounding to nearest
with ties to Away"
add.h: Remove redundant code.

* [Fix] Fix wrong intrinsic alias names.

* [Refactor] Remove redundant functions.

* [NEON] Add 45 ld2 related intrinsics
one ld2_f16, twenty-two ld2_lane series, and twenty-two ld2_dup series.

* [NEON] Add ld3_dup, ld3_lane, and ld4_dup

* [NEON] Add vld3_f16 and vld4_f16.

* [NEON] Add vld{3/4}_{dup/lane} series intrinsics

* [NEON] Add mla_{high}_lane series intrinsics

* [NEON] Add qdmlal_{high}_{lane} series intrinsics.

* [NEON] Add qdmlal_lane and qdmlal_n series intrinsics

* [NEON] Add mls_lane and mlsl_high_lane series intrinsics

* [NEON] Add 22 qdmlsl series intrinsics

* [NEON] Add 10 qdmull_* series intrinsics

* [NEON] Add 3 qdmulh series intrinsics

* [Fix] Fix wrong function name.

* [Fix] Correct the wrong alias function name.

* [NEON] Add qdmullh_lane{q}_s{16/32} related intrinsics

* [NEON] Add qdmull_n and qdmull_high_lane series intrinsics

* [Fix] Add conditions for fp16 intrinsics

* [Hack] Skip functions that trigger compiler bugs.
  • Loading branch information
yyctw authored Oct 18, 2023
1 parent d08d67c commit 5e7c4d4
Show file tree
Hide file tree
Showing 129 changed files with 33,598 additions and 190 deletions.
34 changes: 34 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ cxx = meson.get_compiler('cpp')

simde_neon_families = [
'aba',
'abal',
'abal_high',
'abd',
'abdl',
'abs',
Expand All @@ -29,6 +31,8 @@ simde_neon_families = [
'cadd_rot90',
'cage',
'cagt',
'cale',
'calt',
'ceq',
'ceqz',
'cge',
Expand All @@ -51,6 +55,7 @@ simde_neon_families = [
'cmla_rot270',
'cnt',
'cvt',
'cvt_n',
'cvtn',
'combine',
'create',
Expand All @@ -64,6 +69,9 @@ simde_neon_families = [
'fma',
'fma_lane',
'fma_n',
'fms',
'fms_lane',
'fms_n',
'get_high',
'get_lane',
'get_low',
Expand All @@ -79,8 +87,13 @@ simde_neon_families = [
'ld1q_x4',
'ld1',
'ld2',
'ld2_dup',
'ld2_lane',
'ld3',
'ld3_dup',
'ld3_lane',
'ld4',
'ld4_dup',
'ld4_lane',
'max',
'maxnm',
Expand All @@ -93,16 +106,20 @@ simde_neon_families = [
'mla_n',
'mlal',
'mlal_high',
'mlal_high_lane',
'mlal_high_n',
'mlal_lane',
'mlal_n',
'mls',
'mls_lane',
'mls_n',
'mlsl',
'mlsl_high',
'mlsl_high_lane',
'mlsl_high_n',
'mlsl_lane',
'mlsl_n',
#'mmlaq',
'movl',
'movl_high',
'movn',
Expand All @@ -125,10 +142,27 @@ simde_neon_families = [
'pmin',
'qadd',
'qabs',
'qdmlal',
'qdmlal_high',
'qdmlal_high_lane',
'qdmlal_high_n',
'qdmlal_lane',
'qdmlal_n',
'qdmlsl',
'qdmlsl_high',
'qdmlsl_high_lane',
'qdmlsl_high_n',
'qdmlsl_lane',
'qdmlsl_n',
'qdmulh',
'qdmulh_lane',
'qdmulh_n',
'qdmull',
'qdmull_high',
'qdmull_high_lane',
'qdmull_high_n',
'qdmull_lane',
'qdmull_n',
'qrdmulh',
'qrdmulh_lane',
'qrdmulh_n',
Expand Down
34 changes: 34 additions & 0 deletions simde/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include "neon/types.h"

#include "neon/aba.h"
#include "neon/abal.h"
#include "neon/abal_high.h"
#include "neon/abd.h"
#include "neon/abdl.h"
#include "neon/abs.h"
Expand All @@ -50,6 +52,8 @@
#include "neon/cadd_rot90.h"
#include "neon/cage.h"
#include "neon/cagt.h"
#include "neon/cale.h"
#include "neon/calt.h"
#include "neon/ceq.h"
#include "neon/ceqz.h"
#include "neon/cge.h"
Expand All @@ -72,6 +76,7 @@
#include "neon/cmla_rot270.h"
#include "neon/cnt.h"
#include "neon/cvt.h"
#include "neon/cvt_n.h"
#include "neon/cvtn.h"
#include "neon/combine.h"
#include "neon/create.h"
Expand All @@ -85,6 +90,9 @@
#include "neon/fma.h"
#include "neon/fma_lane.h"
#include "neon/fma_n.h"
#include "neon/fms.h"
#include "neon/fms_lane.h"
#include "neon/fms_n.h"
#include "neon/get_high.h"
#include "neon/get_lane.h"
#include "neon/get_low.h"
Expand All @@ -100,8 +108,13 @@
#include "neon/ld1q_x3.h"
#include "neon/ld1q_x4.h"
#include "neon/ld2.h"
#include "neon/ld2_dup.h"
#include "neon/ld2_lane.h"
#include "neon/ld3.h"
#include "neon/ld3_dup.h"
#include "neon/ld3_lane.h"
#include "neon/ld4.h"
#include "neon/ld4_dup.h"
#include "neon/ld4_lane.h"
#include "neon/max.h"
#include "neon/maxnm.h"
Expand All @@ -114,16 +127,20 @@
#include "neon/mla_n.h"
#include "neon/mlal.h"
#include "neon/mlal_high.h"
#include "neon/mlal_high_lane.h"
#include "neon/mlal_high_n.h"
#include "neon/mlal_lane.h"
#include "neon/mlal_n.h"
#include "neon/mls.h"
#include "neon/mls_lane.h"
#include "neon/mls_n.h"
#include "neon/mlsl.h"
#include "neon/mlsl_high.h"
#include "neon/mlsl_high_lane.h"
#include "neon/mlsl_high_n.h"
#include "neon/mlsl_lane.h"
#include "neon/mlsl_n.h"
//#include "neon/mmlaq.h"
#include "neon/movl.h"
#include "neon/movl_high.h"
#include "neon/movn.h"
Expand All @@ -146,10 +163,27 @@
#include "neon/pmin.h"
#include "neon/qabs.h"
#include "neon/qadd.h"
#include "neon/qdmlal.h"
#include "neon/qdmlal_high.h"
#include "neon/qdmlal_high_lane.h"
#include "neon/qdmlal_high_n.h"
#include "neon/qdmlal_lane.h"
#include "neon/qdmlal_n.h"
#include "neon/qdmlsl.h"
#include "neon/qdmlsl_high.h"
#include "neon/qdmlsl_high_lane.h"
#include "neon/qdmlsl_high_n.h"
#include "neon/qdmlsl_lane.h"
#include "neon/qdmlsl_n.h"
#include "neon/qdmulh.h"
#include "neon/qdmulh_lane.h"
#include "neon/qdmulh_n.h"
#include "neon/qdmull.h"
#include "neon/qdmull_high.h"
#include "neon/qdmull_high_lane.h"
#include "neon/qdmull_high_n.h"
#include "neon/qdmull_lane.h"
#include "neon/qdmull_n.h"
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
Expand Down
125 changes: 125 additions & 0 deletions simde/arm/neon/abal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
*/

#if !defined(SIMDE_ARM_NEON_ABAL_H)
#define SIMDE_ARM_NEON_ABAL_H

#include "abdl.h"
#include "add.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

/* Portable wrapper for vabal_s8.
 *
 * When SIMDE_ARM_NEON_A32V7_NATIVE is defined the call is forwarded to the
 * native vabal_s8 intrinsic. Otherwise the result of simde_vabdl_s8(b, c)
 * is added to the accumulator `a` with simde_vaddq_s16. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s8(a, b, c);
  #else
    /* Compute the b/c term first, then fold it into the accumulator. */
    simde_int16x8_t abdl_bc = simde_vabdl_s8(b, c);
    return simde_vaddq_s16(abdl_bc, a);
  #endif
}
/* Expose the unprefixed NEON name when native aliases are requested. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_s8
  #define vabal_s8(a, b, c) simde_vabal_s8((a), (b), (c))
#endif

/* Portable wrapper for vabal_s16.
 *
 * When SIMDE_ARM_NEON_A32V7_NATIVE is defined the call is forwarded to the
 * native vabal_s16 intrinsic. Otherwise the result of simde_vabdl_s16(b, c)
 * is added to the accumulator `a` with simde_vaddq_s32. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s16(a, b, c);
  #else
    /* Compute the b/c term first, then fold it into the accumulator. */
    simde_int32x4_t abdl_bc = simde_vabdl_s16(b, c);
    return simde_vaddq_s32(abdl_bc, a);
  #endif
}
/* Expose the unprefixed NEON name when native aliases are requested. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_s16
  #define vabal_s16(a, b, c) simde_vabal_s16((a), (b), (c))
#endif

/* Portable wrapper for vabal_s32.
 *
 * When SIMDE_ARM_NEON_A32V7_NATIVE is defined the call is forwarded to the
 * native vabal_s32 intrinsic. Otherwise the result of simde_vabdl_s32(b, c)
 * is added to the accumulator `a` with simde_vaddq_s64. */
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_s32(a, b, c);
  #else
    /* Compute the b/c term first, then fold it into the accumulator. */
    simde_int64x2_t abdl_bc = simde_vabdl_s32(b, c);
    return simde_vaddq_s64(abdl_bc, a);
  #endif
}
/* Expose the unprefixed NEON name when native aliases are requested. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_s32
  #define vabal_s32(a, b, c) simde_vabal_s32((a), (b), (c))
#endif

/* Portable wrapper for vabal_u8.
 *
 * When SIMDE_ARM_NEON_A32V7_NATIVE is defined the call is forwarded to the
 * native vabal_u8 intrinsic. Otherwise the result of simde_vabdl_u8(b, c)
 * is added to the accumulator `a` with simde_vaddq_u16. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u8(a, b, c);
  #else
    /* Compute the b/c term first, then fold it into the accumulator. */
    simde_uint16x8_t abdl_bc = simde_vabdl_u8(b, c);
    return simde_vaddq_u16(abdl_bc, a);
  #endif
}
/* Expose the unprefixed NEON name when native aliases are requested. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_u8
  #define vabal_u8(a, b, c) simde_vabal_u8((a), (b), (c))
#endif

/* Portable wrapper for vabal_u16.
 *
 * When SIMDE_ARM_NEON_A32V7_NATIVE is defined the call is forwarded to the
 * native vabal_u16 intrinsic. Otherwise the result of simde_vabdl_u16(b, c)
 * is added to the accumulator `a` with simde_vaddq_u32. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u16(a, b, c);
  #else
    /* Compute the b/c term first, then fold it into the accumulator. */
    simde_uint32x4_t abdl_bc = simde_vabdl_u16(b, c);
    return simde_vaddq_u32(abdl_bc, a);
  #endif
}
/* Expose the unprefixed NEON name when native aliases are requested. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_u16
  #define vabal_u16(a, b, c) simde_vabal_u16((a), (b), (c))
#endif

/* Portable wrapper for vabal_u32.
 *
 * When SIMDE_ARM_NEON_A32V7_NATIVE is defined the call is forwarded to the
 * native vabal_u32 intrinsic. Otherwise the result of simde_vabdl_u32(b, c)
 * is added to the accumulator `a` with simde_vaddq_u64. */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) {
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    return vabal_u32(a, b, c);
  #else
    /* Compute the b/c term first, then fold it into the accumulator. */
    simde_uint64x2_t abdl_bc = simde_vabdl_u32(b, c);
    return simde_vaddq_u64(abdl_bc, a);
  #endif
}
/* Expose the unprefixed NEON name when native aliases are requested. */
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
  #undef vabal_u32
  #define vabal_u32(a, b, c) simde_vabal_u32((a), (b), (c))
#endif


SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_ARM_NEON_ABAL_H) */
Loading

0 comments on commit 5e7c4d4

Please sign in to comment.