Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NEON : Complex operations from Armv8.3-a #1077

Merged
merged 29 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
78593be
[Neon] Add vcadd_rot270_f{16/32} and vcaddq_rot270_f{16/32/64}
Oct 16, 2023
617cf05
[Neon] Add vcadd_rot90_f{16/32} and vcaddq_rot90_f{16/32/64}
Oct 16, 2023
94cc32b
[Neon] Add vcmla_lane_f{16/32} and vcmla_laneq_f{16/32} and vcmlaq_la…
Oct 16, 2023
596cafc
[Neon] Add vcmla_rot90_lane_f{16/32} and vcmla_rot90_laneq_f{16/32} a…
Oct 16, 2023
4b6c6c7
[Neon] Add vcmla_rot180_lane_f{16/32} and vcmla_rot180_laneq_f{16/32}…
Oct 16, 2023
560be04
[Neon] Add vcadd_rot270_f{16/32} and vcaddq_rot270_f{16/32/64}
Oct 16, 2023
1818df3
[Neon] : add meson.build and simde/arm/neon.h
Oct 16, 2023
6fbd8c2
[Fix] : add newline
Oct 16, 2023
72f8a1c
[Fix] : formatting the code
Oct 16, 2023
318c72b
[Fix] : add newline
Oct 16, 2023
5866e62
[Fix] : invalid operands to binary expression for f16
Oct 16, 2023
7a7cc03
[Fix] : operation for f16
Oct 16, 2023
21fb644
[Fix] : simde_vaddh_f16 missed
Oct 16, 2023
bf58e4d
[Fix] : invalid argument type 'simde_float16' to unary expression
Oct 16, 2023
856b4a7
[Fix] : not using static const struct for f16
Oct 16, 2023
9f6cace
[Fix] : f16 intrinsic of cadd_rot270 and cadd_rot90
Oct 16, 2023
dfc4481
[Fix] : f16 intrinsics
Oct 16, 2023
d262180
[Fix] : format the code
Oct 16, 2023
0880dac
[Fix] : add newline in test/arm/neon/cadd_rot270.c
Oct 16, 2023
d3fab5b
[Fix] : remove comment for test code
Oct 17, 2023
e42ff32
[Fix] : coding style
Oct 17, 2023
609c46f
[Fix] : warning of unused variable
Oct 17, 2023
3eac558
[Fix] : use another way to implement f16 functions
Oct 17, 2023
e966085
[Fix] : use implementation of f16 functions
Oct 17, 2023
20131b3
[Fix] : delete conflicting type
Oct 17, 2023
0c531c3
[Fix] : another implementation for vcmla{/q}_rot{180/270/90}_lane{/q}…
Oct 17, 2023
788e06e
[Fix] : implementation for vcmla{/q}_rot{180/270/90}_lane{/q}_f16 and…
Oct 17, 2023
790d471
[Fix] : elements in shuffle vector
Oct 17, 2023
19ed113
[Fix] : formatting with ColumnLimit = 125, IdentWidth = 2, TabWidth = 4
Oct 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ simde_neon_families = [
'bcax',
'bic',
'bsl',
'cadd_rot270',
'cadd_rot90',
'cage',
'cagt',
'ceq',
Expand All @@ -40,6 +42,10 @@ simde_neon_families = [
'cltz',
'clz',
'cmla',
'cmla_lane',
'cmla_rot180_lane',
'cmla_rot270_lane',
'cmla_rot90_lane',
'cmla_rot90',
'cmla_rot180',
'cmla_rot270',
Expand Down
6 changes: 6 additions & 0 deletions simde/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
#include "neon/bcax.h"
#include "neon/bic.h"
#include "neon/bsl.h"
#include "neon/cadd_rot270.h"
#include "neon/cadd_rot90.h"
#include "neon/cage.h"
#include "neon/cagt.h"
#include "neon/ceq.h"
Expand All @@ -61,6 +63,10 @@
#include "neon/cltz.h"
#include "neon/clz.h"
#include "neon/cmla.h"
#include "neon/cmla_lane.h"
#include "neon/cmla_rot180_lane.h"
#include "neon/cmla_rot270_lane.h"
#include "neon/cmla_rot90_lane.h"
#include "neon/cmla_rot90.h"
#include "neon/cmla_rot180.h"
#include "neon/cmla_rot270.h"
Expand Down
183 changes: 183 additions & 0 deletions simde/arm/neon/cadd_rot270.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2023 Chi-Wei Chu <[email protected]>
*/

#if !defined(SIMDE_ARM_NEON_CADD_ROT270_H)
#define SIMDE_ARM_NEON_CADD_ROT270_H

#include "add.h"
#include "types.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

/* Pairwise "complex add, rotate 270" on four half-precision lanes.
 *
 * Lanes are consumed in adjacent (even, odd) pairs. For every pair i:
 *   r[2i]     = b[2i + 1] + a[2i]
 *   r[2i + 1] = -b[2i]    + a[2i + 1]
 * If each pair is read as (real, imaginary), this is a + (-i * b).
 *
 * a, b: input vectors.
 * Returns the vector of per-pair results.
 *
 * The native intrinsic is used only when the preprocessor guard below holds;
 * otherwise the result is computed with a vector shuffle (when available for
 * the active float16 API) or with a scalar loop. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t b)
{
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
      (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
      (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
    return vcadd_rot270_f16(a, b);
  #else
    simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b);
    #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
        ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
      /* The second macro argument is the vector size in bytes (4 lanes * 2 bytes = 8),
       * the same convention the f32/f64 variants in this file follow (8, 16, 16).
       * Indices 0..3 select from -b, indices 4..7 select from b, so the result is
       * { b[1], -b[0], b[3], -b[2] }. */
      b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 5, 0, 7, 2);
      r_.values = b_.values + a_.values;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
      {
        r_.values[2 * i] = simde_vaddh_f16(b_.values[2 * i + 1], a_.values[2 * i]);
        /* Negate through float32: the portable float16 type may not support unary minus. */
        r_.values[2 * i + 1] =
          simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i])), a_.values[2 * i + 1]);
      }
    #endif
    return simde_float16x4_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vcadd_rot270_f16
  #define vcadd_rot270_f16(a, b) simde_vcadd_rot270_f16(a, b)
#endif

/* Pairwise "complex add, rotate 270" on eight half-precision lanes.
 *
 * Lanes are consumed in adjacent (even, odd) pairs. For every pair i:
 *   r[2i]     = b[2i + 1] + a[2i]
 *   r[2i + 1] = -b[2i]    + a[2i + 1]
 * If each pair is read as (real, imaginary), this is a + (-i * b).
 *
 * a, b: input vectors.
 * Returns the vector of per-pair results.
 *
 * The native intrinsic is used only when the preprocessor guard below holds;
 * otherwise the result is computed with a vector shuffle (when available for
 * the active float16 API) or with a scalar loop. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t b)
{
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
      (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
      (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
    return vcaddq_rot270_f16(a, b);
  #else
    simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b);
    #if defined(SIMDE_SHUFFLE_VECTOR_) && \
        ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
      /* The second macro argument is the vector size in bytes (8 lanes * 2 bytes = 16),
       * the same convention the f32/f64 variants in this file follow (8, 16, 16).
       * Indices 0..7 select from -b, indices 8..15 select from b, so the result is
       * { b[1], -b[0], b[3], -b[2], b[5], -b[4], b[7], -b[6] }. */
      b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6);
      r_.values = b_.values + a_.values;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
      {
        r_.values[2 * i] = simde_vaddh_f16(b_.values[2 * i + 1], a_.values[2 * i]);
        /* Negate through float32: the portable float16 type may not support unary minus. */
        r_.values[2 * i + 1] =
          simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i])), a_.values[2 * i + 1]);
      }
    #endif
    return simde_float16x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vcaddq_rot270_f16
  #define vcaddq_rot270_f16(a, b) simde_vcaddq_rot270_f16(a, b)
#endif

/* Pairwise "complex add, rotate 270" on two single-precision lanes.
 *
 * The two lanes form one (even, odd) pair:
 *   r[0] = b[1]  + a[0]
 *   r[1] = -b[0] + a[1]
 * If the pair is read as (real, imaginary), this is a + (-i * b).
 *
 * a, b: input vectors.
 * Returns the result vector.
 *
 * Dispatch order: native intrinsic when the guard below holds, then a vector
 * shuffle when SIMDE_SHUFFLE_VECTOR_ is usable, then a scalar loop. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t simde_vcadd_rot270_f32(simde_float32x2_t a, simde_float32x2_t b)
{
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
      (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
      (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
    return vcadd_rot270_f32(a, b);
  #else
    simde_float32x2_private sum_;
    simde_float32x2_private lhs_ = simde_float32x2_to_private(a);
    simde_float32x2_private rhs_ = simde_float32x2_to_private(b);
    #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
      /* Indices 0..1 pick from -b, 2..3 pick from b: result is { b[1], -b[0] }. */
      rhs_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -rhs_.values, rhs_.values, 3, 0);
      sum_.values = rhs_.values + lhs_.values;
    #else
      const size_t pair_count = sizeof(sum_.values) / (2 * sizeof(sum_.values[0]));
      SIMDE_VECTORIZE
      for (size_t pair = 0; pair < pair_count; pair++) {
        const size_t even = 2 * pair;
        const size_t odd = 2 * pair + 1;
        sum_.values[even] = rhs_.values[odd] + lhs_.values[even];
        sum_.values[odd] = -(rhs_.values[even]) + lhs_.values[odd];
      }
    #endif
    return simde_float32x2_from_private(sum_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vcadd_rot270_f32
  #define vcadd_rot270_f32(a, b) simde_vcadd_rot270_f32(a, b)
#endif

/* Pairwise "complex add, rotate 270" on four single-precision lanes.
 *
 * Lanes are consumed in adjacent (even, odd) pairs. For every pair i:
 *   r[2i]     = b[2i + 1] + a[2i]
 *   r[2i + 1] = -b[2i]    + a[2i + 1]
 * If each pair is read as (real, imaginary), this is a + (-i * b).
 *
 * a, b: input vectors.
 * Returns the vector of per-pair results.
 *
 * Dispatch order: native intrinsic when the guard below holds, then a vector
 * shuffle when SIMDE_SHUFFLE_VECTOR_ is defined, then a scalar loop. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t b)
{
  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
      (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
      (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
    return vcaddq_rot270_f32(a, b);
  #else
    simde_float32x4_private out_;
    simde_float32x4_private x_ = simde_float32x4_to_private(a);
    simde_float32x4_private y_ = simde_float32x4_to_private(b);
    #if defined(SIMDE_SHUFFLE_VECTOR_)
      /* Indices 0..3 pick from -b, 4..7 pick from b: result is { b[1], -b[0], b[3], -b[2] }. */
      y_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -y_.values, y_.values, 5, 0, 7, 2);
      out_.values = y_.values + x_.values;
    #else
      const size_t n_pairs = sizeof(out_.values) / (2 * sizeof(out_.values[0]));
      SIMDE_VECTORIZE
      for (size_t k = 0; k < n_pairs; k++) {
        const size_t lo = 2 * k;
        const size_t hi = 2 * k + 1;
        out_.values[lo] = y_.values[hi] + x_.values[lo];
        out_.values[hi] = -(y_.values[lo]) + x_.values[hi];
      }
    #endif
    return simde_float32x4_from_private(out_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
  #undef vcaddq_rot270_f32
  #define vcaddq_rot270_f32(a, b) simde_vcaddq_rot270_f32(a, b)
#endif

/* Pairwise "complex add, rotate 270" on two double-precision lanes.
 *
 * The two lanes form one (even, odd) pair:
 *   r[0] = b[1]  + a[0]
 *   r[1] = -b[0] + a[1]
 * If the pair is read as (real, imaginary), this is a + (-i * b).
 *
 * a, b: input vectors.
 * Returns the result vector.
 *
 * Dispatch order: native intrinsic when the A64V8 guard below holds, then a
 * vector shuffle when SIMDE_SHUFFLE_VECTOR_ is defined, then a scalar loop. */
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x2_t simde_vcaddq_rot270_f64(simde_float64x2_t a, simde_float64x2_t b)
{
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
      (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
      (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
    return vcaddq_rot270_f64(a, b);
  #else
    simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b);
    #if defined(SIMDE_SHUFFLE_VECTOR_)
      /* Indices 0..1 pick from -b, 2..3 pick from b: result is { b[1], -b[0] }. */
      b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 3, 0);
      r_.values = b_.values + a_.values;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
      {
        r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i];
        r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1];
      }
    #endif
    return simde_float64x2_from_private(r_);
  #endif
}
/* The native path above is gated on A64V8, so the alias is gated on the A64V8
 * alias switch as well; gating on the A32V8 switch would leave the name
 * undefined on targets that have native A32V8 but not A64V8. */
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  #undef vcaddq_rot270_f64
  #define vcaddq_rot270_f64(a, b) simde_vcaddq_rot270_f64(a, b)
#endif

SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_ARM_NEON_CADD_ROT270_H) */
Loading