From 84ebde4a7ea13e943fb79486d32642ec5e1a9aeb Mon Sep 17 00:00:00 2001 From: LI Qingwu Date: Mon, 9 Sep 2024 16:30:51 +0800 Subject: [PATCH] arm neon: fix arm64 gcc11 build excess elements in vector failure Resolved a build failure on ARM64 with GCC 11 caused by excess elements in a vector initializer. The issue stemmed from the `__builtin_shuffle` function, where the number of elements in the argument vector(s) and the mask vector must match. For more details, refer to issue #1211. Signed-off-by: LI Qingwu --- simde/arm/neon/cadd_rot270.h | 4 ++-- simde/arm/neon/cadd_rot90.h | 4 ++-- simde/arm/neon/cmla_lane.h | 12 ++++++------ simde/arm/neon/cmla_rot180_lane.h | 20 ++++++++++---------- simde/arm/neon/cmla_rot270_lane.h | 20 ++++++++++---------- simde/arm/neon/cmla_rot90_lane.h | 20 ++++++++++---------- 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/simde/arm/neon/cadd_rot270.h b/simde/arm/neon/cadd_rot270.h index 1c557c1df..624a7c8a5 100644 --- a/simde/arm/neon/cadd_rot270.h +++ b/simde/arm/neon/cadd_rot270.h @@ -54,7 +54,7 @@ simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 5, 0, 7, 2); r_.values = b_.values + a_.values; #else SIMDE_VECTORIZE @@ -90,7 +90,7 @@ simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6); r_.values = b_.values + a_.values; #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/cadd_rot90.h b/simde/arm/neon/cadd_rot90.h index 70a2c4f9d..db43d0121 100644 --- a/simde/arm/neon/cadd_rot90.h +++ b/simde/arm/neon/cadd_rot90.h @@ -54,7 +54,7 @@ simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4); #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 1, 4, 3, 6); r_.values = b_.values + a_.values; #else SIMDE_VECTORIZE @@ -90,7 +90,7 @@ simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8); #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 1, 8, 3, 10, 5, 12, 7, 14); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 1, 8, 3, 10, 5, 12, 7, 14); r_.values = b_.values + a_.values; #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/cmla_lane.h b/simde/arm/neon/cmla_lane.h index 9256ceccc..9a9f885d3 100644 --- a/simde/arm/neon/cmla_lane.h +++ b/simde/arm/neon/cmla_lane.h @@ -59,7 +59,7 @@ simde_float16x4_t simde_vcmla_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); r_.values += b_.values * a_.values; #else SIMDE_VECTORIZE @@ -142,7 +142,7 @@ simde_float16x4_t simde_vcmla_laneq_f16(simde_float16x4_t r, simde_float16x4_t a simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); r_.values += b_.values * a_.values; #else SIMDE_VECTORIZE @@ -228,8 +228,8 @@ simde_float16x8_t simde_vcmlaq_lane_f16(simde_float16x8_t r, simde_float16x8_t a simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); r_low.values += b_.values * a_low.values; r_high.values += b_.values * a_high.values; #else @@ -320,9 +320,9 @@ simde_float16x8_t simde_vcmlaq_laneq_f16(simde_float16x8_t r, simde_float16x8_t simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); r_low.values += b_.values * a_low.values; - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); r_high.values += b_.values * a_high.values; #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/cmla_rot180_lane.h b/simde/arm/neon/cmla_rot180_lane.h index 0c25d0ee4..8800fbc26 100644 --- a/simde/arm/neon/cmla_rot180_lane.h +++ b/simde/arm/neon/cmla_rot180_lane.h @@ -62,8 +62,8 @@ simde_float16x4_t simde_vcmla_rot180_lane_f16(simde_float16x4_t r, simde_float16 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); r_.values += b_.values * a_.values; #else SIMDE_VECTORIZE @@ -151,9 +151,9 @@ simde_float16x8_t simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r, simde_float1 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); r_low.values += b_.values * a_low.values; r_high.values += b_.values * a_high.values; #else @@ -243,8 +243,8 @@ simde_float16x4_t simde_vcmla_rot180_laneq_f16(simde_float16x4_t r, simde_float1 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); r_.values += b_.values * a_.values; #else SIMDE_VECTORIZE @@ -332,9 +332,9 @@ simde_float16x8_t simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r, simde_float simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); r_low.values += b_.values * a_low.values; r_high.values += b_.values * a_high.values; #else diff --git a/simde/arm/neon/cmla_rot270_lane.h b/simde/arm/neon/cmla_rot270_lane.h index 38cd93460..c03f5c147 100644 --- a/simde/arm/neon/cmla_rot270_lane.h +++ b/simde/arm/neon/cmla_rot270_lane.h @@ -62,8 +62,8 @@ simde_float16x4_t simde_vcmla_rot270_lane_f16(simde_float16x4_t r, simde_float16 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_.values += b_.values * a_.values; #else SIMDE_VECTORIZE @@ -151,9 +151,9 @@ simde_float16x8_t simde_vcmlaq_rot270_lane_f16(simde_float16x8_t r, simde_float1 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_low.values += b_.values * a_low.values; r_high.values += b_.values * a_high.values; #else @@ -243,8 +243,8 @@ simde_float16x4_t simde_vcmla_rot270_laneq_f16(simde_float16x4_t r, simde_float1 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_.values += b_.values * a_.values; #else SIMDE_VECTORIZE @@ -332,9 +332,9 @@ simde_float16x8_t simde_vcmlaq_rot270_laneq_f16(simde_float16x8_t r, simde_float simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_high.values += b_.values * a_high.values; r_low.values += b_.values * a_low.values; #else diff --git a/simde/arm/neon/cmla_rot90_lane.h b/simde/arm/neon/cmla_rot90_lane.h index 338930281..7e4cbea29 100644 --- a/simde/arm/neon/cmla_rot90_lane.h +++ b/simde/arm/neon/cmla_rot90_lane.h @@ -62,8 +62,8 @@ simde_float16x4_t simde_vcmla_rot90_lane_f16(simde_float16x4_t r, simde_float16x simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values += b_.values * a_.values; #else SIMDE_VECTORIZE @@ -147,8 +147,8 @@ simde_float16x4_t simde_vcmla_rot90_laneq_f16(simde_float16x4_t r, simde_float16 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values += b_.values * a_.values; #else SIMDE_VECTORIZE @@ -237,9 +237,9 @@ simde_float16x8_t simde_vcmlaq_rot90_lane_f16(simde_float16x8_t r, simde_float16 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_low.values += b_.values * a_low.values; r_high.values += b_.values * a_high.values; #else @@ -333,9 +333,9 @@ simde_float16x8_t simde_vcmlaq_rot90_laneq_f16(simde_float16x8_t r, simde_float1 simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); #if defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_low.values += b_.values * a_low.values; r_high.values += b_.values * a_high.values; #else