From b4e805a56c5f3ae47140263fdad0e3baa124e32a Mon Sep 17 00:00:00 2001
From: Eric Su <77781328+eric900115@users.noreply.github.com>
Date: Thu, 14 Mar 2024 21:18:43 +0800
Subject: [PATCH] Initial Support for the RISC-V Vector Extension in ARM NEON
 (#1130)

* feat: add CI test for the RISC-V Vector extension
* feat: modify types.h for the RISC-V Vector extension
* feat: modify SIMDe utilities for RVV
* feat: modify load & store for the RISC-V V extension
* feat: modify load & store for RISC-V Vector
* feat: add NEON-to-RVV add and mul
* feat: add RVV CI without zvfh
* feat: add RVV implementation (mul_lane)
* feat: add mulx_lane neon2rvv
---
 .github/workflows/ci.yml                      |  76 ++
 ...4+rvv_vlen128_elen64-clang-17-ccache.cross |  37 +
 ..._vlen128_elen64_zvfh-clang-17-ccache.cross |  37 +
 ...4+rvv_vlen256_elen64-clang-17-ccache.cross |  37 +
 ..._vlen256_elen64_zvfh-clang-17-ccache.cross |  37 +
 ...4+rvv_vlen512_elen64-clang-17-ccache.cross |  37 +
 ..._vlen512_elen64_zvfh-clang-17-ccache.cross |  37 +
 simde/arm/neon/add.h                          |  95 +-
 simde/arm/neon/ld1.h                          | 125 ++-
 simde/arm/neon/ld1_x2.h                       | 155 +++-
 simde/arm/neon/ld1_x3.h                       | 169 +++-
 simde/arm/neon/ld1_x4.h                       | 183 +++-
 simde/arm/neon/ld1q_x2.h                      | 155 +++-
 simde/arm/neon/ld1q_x3.h                      | 169 +++-
 simde/arm/neon/ld1q_x4.h                      | 183 +++-
 simde/arm/neon/ld2.h                          | 308 ++++++-
 simde/arm/neon/ld3.h                          | 415 +++++++--
 simde/arm/neon/ld4.h                          | 393 ++++++--
 simde/arm/neon/mul.h                          |  95 +-
 simde/arm/neon/mul_lane.h                     | 289 ++++--
 simde/arm/neon/mulx_lane.h                    |  96 +-
 simde/arm/neon/st1.h                          | 125 ++-
 simde/arm/neon/st1_x2.h                       |  39 +-
 simde/arm/neon/st1_x3.h                       |  44 +-
 simde/arm/neon/st1_x4.h                       |  47 +-
 simde/arm/neon/st1q_x2.h                      |  39 +-
 simde/arm/neon/st1q_x3.h                      |  44 +-
 simde/arm/neon/st1q_x4.h                      |  47 +-
 simde/arm/neon/st2.h                          | 391 ++++++--
 simde/arm/neon/st3.h                          | 353 ++++++--
 simde/arm/neon/st4.h                          | 537 ++++++++---
 simde/arm/neon/types.h                        | 165 ++++
 simde/simde-arch.h                            |  36 +-
 simde/simde-common.h                          |  29 +
 simde/simde-f16.h                             |   4 +-
 simde/simde-features.h                        |  17 +
 test/arm/neon/reinterpret.c                   | 854 +++++++++---------
 test/arm/neon/test-neon.h                     |  40 +-
 38 files changed, 4465 insertions(+), 1474 deletions(-)
 create mode 100644 docker/cross-files/riscv64+rvv_vlen128_elen64-clang-17-ccache.cross
 create mode 100644 docker/cross-files/riscv64+rvv_vlen128_elen64_zvfh-clang-17-ccache.cross
 create mode 100644 docker/cross-files/riscv64+rvv_vlen256_elen64-clang-17-ccache.cross
 create mode 100644 docker/cross-files/riscv64+rvv_vlen256_elen64_zvfh-clang-17-ccache.cross
 create mode 100644 docker/cross-files/riscv64+rvv_vlen512_elen64-clang-17-ccache.cross
 create mode 100644 docker/cross-files/riscv64+rvv_vlen512_elen64_zvfh-clang-17-ccache.cross

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e34d4da64..d0e97bc70 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -396,6 +396,82 @@ jobs:
       - name: Test
         run: meson test -C build --print-errorlogs --print-errorlogs $(meson test -C build --list | grep -v emul)
 
+  clang-qemu-rvv:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - version: 17
+            cross: riscv64+rvv_vlen128_elen64
+            arch_gnu: riscv64
+            arch_deb: riscv64
+            distro: ubuntu-22.04
+          - version: 17
+            cross: riscv64+rvv_vlen256_elen64
+            arch_gnu: riscv64
+            arch_deb: riscv64
+            distro: ubuntu-22.04
+          - version: 17
+            cross: riscv64+rvv_vlen512_elen64
+            arch_gnu: riscv64
+            arch_deb: riscv64
+            distro: ubuntu-22.04
+          - version: 17
+            cross: riscv64+rvv_vlen128_elen64_zvfh
+            arch_gnu: riscv64
+            arch_deb: riscv64
+            distro: ubuntu-22.04
+          - version: 17
+            cross: riscv64+rvv_vlen256_elen64_zvfh
+            arch_gnu: riscv64
+            arch_deb: riscv64
+            distro: ubuntu-22.04
+          - version: 17
+            cross: riscv64+rvv_vlen512_elen64_zvfh
+            arch_gnu: riscv64
+            arch_deb: riscv64
+            distro: ubuntu-22.04
+    runs-on: ${{ matrix.distro }}
+    container:
+      image: amd64/ubuntu:23.10
+    steps:
+      - run: apt-get update
+      - name: Install git
+        run: |
+          apt-get install -y git
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - name: CPU Information
+        run: cat /proc/cpuinfo
+      - name: Install APT Dependencies
+        run: |
+          apt-get install -y python3 python3-pip git ninja-build pkg-config libglib2.0-dev \
+            lsb-release wget software-properties-common gnupg qemu-user pipx
+          apt-get install -y clang-${{ matrix.version }} lldb-${{ matrix.version }} lld-${{ matrix.version }}
+          #add-apt-repository ppa:savoury1/virtualisation
+          #add-apt-repository ppa:savoury1/display
+          apt-get update -y
+          apt-get -yq install ninja-build parallel \
+            binfmt-support libc6-${{ matrix.arch_deb }}-cross \
+            libstdc++-12-dev-${{ matrix.arch_deb }}-cross binutils-${{ matrix.arch_gnu }}-linux-gnu
+          apt-get install -y meson
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ github.job }}-${{ matrix.distro }}-${{ matrix.cross }}
+      - name: add ccache to the build path
+        run: |
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+      - name: Configure
+        run: |
+          meson setup --cross-file=docker/cross-files/${{ matrix.cross }}-clang-${{ matrix.version }}-ccache.cross build \
+            || (cat build/meson-logs/meson-log.txt ; false)
+      - name: Build
+        run: ninja -C build -v
+      - name: Test
+        run: meson test -C build --print-errorlogs $(meson test -C build --list | grep -v emul)
+
   clang-qemu:
     strategy:
       fail-fast: false
diff --git a/docker/cross-files/riscv64+rvv_vlen128_elen64-clang-17-ccache.cross b/docker/cross-files/riscv64+rvv_vlen128_elen64-clang-17-ccache.cross
new file mode 100644
index 000000000..ae1755341
--- /dev/null
+++ b/docker/cross-files/riscv64+rvv_vlen128_elen64-clang-17-ccache.cross
@@ -0,0 +1,37 @@
+[binaries]
+c = 'clang-17'
+cpp = 'clang++-17'
+ar = 'llvm-ar-17'
+strip = 'llvm-strip-17'
+objcopy = 'llvm-objcopy-17'
+ld = 'llvm-ld-17'
+exe_wrapper = ['qemu-riscv64', '-L', '/usr/riscv64-linux-gnu/', '-cpu', 'rv64,v=true,vlen=128,elen=64,vext_spec=v1.0']
+
+[properties]
+c_args = ['--target=riscv64-linux-gnu',
+          '-isystem=/usr/riscv64-linux-gnu/include',
+          '-march=rv64gcv1',
+          '-menable-experimental-extensions',
+          '-mrvv-vector-bits=128',
+          '-Wextra',
+          '-Werror',
+          '-Wno-unsafe-buffer-usage']
+
+cpp_args = ['--target=riscv64-linux-gnu',
+            '-isystem=/usr/riscv64-linux-gnu/include',
+            '-march=rv64gcv1',
+            '-menable-experimental-extensions',
+            '-mrvv-vector-bits=128',
+            '-Wextra',
+            '-Werror',
+            '-Wno-unsafe-buffer-usage']
+
+c_link_args = ['--target=riscv64-linux-gnu']
+
+cpp_link_args = ['--target=riscv64-linux-gnu']
+
+[host_machine]
+system = 'linux'
+cpu_family = 'riscv64'
+cpu = 'rv64,v=true,vlen=128,elen=64,vext_spec=v1.0'
+endian = 'little'
diff --git a/docker/cross-files/riscv64+rvv_vlen128_elen64_zvfh-clang-17-ccache.cross b/docker/cross-files/riscv64+rvv_vlen128_elen64_zvfh-clang-17-ccache.cross
new file mode 100644
index 000000000..8f1076fae
--- /dev/null
+++ b/docker/cross-files/riscv64+rvv_vlen128_elen64_zvfh-clang-17-ccache.cross
@@ -0,0 +1,37 @@
+[binaries]
+c = 'clang-17'
+cpp = 'clang++-17'
+ar = 'llvm-ar-17'
+strip = 'llvm-strip-17'
+objcopy = 'llvm-objcopy-17'
+ld = 'llvm-ld-17'
+exe_wrapper = ['qemu-riscv64', '-L', '/usr/riscv64-linux-gnu/', '-cpu',
'rv64,v=true,Zfh=true,x-zvfh=true,vlen=128,elen=64,vext_spec=v1.0'] + +[properties] +c_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1_zfh_zvfh', + '-menable-experimental-extensions', + '-mrvv-vector-bits=128', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +cpp_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1_zfh_zvfh', + '-menable-experimental-extensions', + '-mrvv-vector-bits=128', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +c_link_args = ['--target=riscv64-linux-gnu'] + +cpp_link_args = ['--target=riscv64-linux-gnu'] + +[host_machine] +system = 'linux' +cpu_family = 'riscv64' +cpu = 'rv64,v=true,Zfh=true,x-zvfh=true,vlen=128,elen=64,vext_spec=v1.0' +endian = 'little' diff --git a/docker/cross-files/riscv64+rvv_vlen256_elen64-clang-17-ccache.cross b/docker/cross-files/riscv64+rvv_vlen256_elen64-clang-17-ccache.cross new file mode 100644 index 000000000..60ee2253b --- /dev/null +++ b/docker/cross-files/riscv64+rvv_vlen256_elen64-clang-17-ccache.cross @@ -0,0 +1,37 @@ +[binaries] +c = 'clang-17' +cpp = 'clang++-17' +ar = 'llvm-ar-17' +strip = 'llvm-strip-17' +objcopy = 'llvm-objcopy-17' +ld = 'llvm-ld-17' +exe_wrapper = ['qemu-riscv64', '-L', '/usr/riscv64-linux-gnu/', '-cpu', 'rv64,v=true,vlen=256,elen=64,vext_spec=v1.0'] + +[properties] +c_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1', + '-menable-experimental-extensions', + '-mrvv-vector-bits=256', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +cpp_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1', + '-menable-experimental-extensions', + '-mrvv-vector-bits=256', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +c_link_args = ['--target=riscv64-linux-gnu'] + +cpp_link_args = ['--target=riscv64-linux-gnu'] + +[host_machine] +system = 'linux' +cpu_family = 'riscv64' +cpu = 'rv64,v=true,vlen=256,elen=64,vext_spec=v1.0' +endian = 'little' diff --git a/docker/cross-files/riscv64+rvv_vlen256_elen64_zvfh-clang-17-ccache.cross b/docker/cross-files/riscv64+rvv_vlen256_elen64_zvfh-clang-17-ccache.cross new file mode 100644 index 000000000..107016a79 --- /dev/null +++ b/docker/cross-files/riscv64+rvv_vlen256_elen64_zvfh-clang-17-ccache.cross @@ -0,0 +1,37 @@ +[binaries] +c = 'clang-17' +cpp = 'clang++-17' +ar = 'llvm-ar-17' +strip = 'llvm-strip-17' +objcopy = 'llvm-objcopy-17' +ld = 'llvm-ld-17' +exe_wrapper = ['qemu-riscv64', '-L', '/usr/riscv64-linux-gnu/', '-cpu', 'rv64,v=true,Zfh=true,x-zvfh=true,vlen=256,elen=64,vext_spec=v1.0'] + +[properties] +c_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1_zfh_zvfh', + '-menable-experimental-extensions', + '-mrvv-vector-bits=256', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +cpp_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1_zfh_zvfh', + '-menable-experimental-extensions', + '-mrvv-vector-bits=256', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +c_link_args = ['--target=riscv64-linux-gnu'] + +cpp_link_args = ['--target=riscv64-linux-gnu'] + +[host_machine] +system = 'linux' +cpu_family = 'riscv64' +cpu = 'rv64,v=true,Zfh=true,x-zvfh=true,vlen=256,elen=64,vext_spec=v1.0' +endian = 'little' diff --git a/docker/cross-files/riscv64+rvv_vlen512_elen64-clang-17-ccache.cross 
b/docker/cross-files/riscv64+rvv_vlen512_elen64-clang-17-ccache.cross new file mode 100644 index 000000000..217b3833d --- /dev/null +++ b/docker/cross-files/riscv64+rvv_vlen512_elen64-clang-17-ccache.cross @@ -0,0 +1,37 @@ +[binaries] +c = 'clang-17' +cpp = 'clang++-17' +ar = 'llvm-ar-17' +strip = 'llvm-strip-17' +objcopy = 'llvm-objcopy-17' +ld = 'llvm-ld-17' +exe_wrapper = ['qemu-riscv64', '-L', '/usr/riscv64-linux-gnu/', '-cpu', 'rv64,v=true,vlen=512,elen=64,vext_spec=v1.0'] + +[properties] +c_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1', + '-menable-experimental-extensions', + '-mrvv-vector-bits=512', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +cpp_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1', + '-menable-experimental-extensions', + '-mrvv-vector-bits=512', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +c_link_args = ['--target=riscv64-linux-gnu'] + +cpp_link_args = ['--target=riscv64-linux-gnu'] + +[host_machine] +system = 'linux' +cpu_family = 'riscv64' +cpu = 'rv64,v=true,vlen=512,elen=64,vext_spec=v1.0' +endian = 'little' diff --git a/docker/cross-files/riscv64+rvv_vlen512_elen64_zvfh-clang-17-ccache.cross b/docker/cross-files/riscv64+rvv_vlen512_elen64_zvfh-clang-17-ccache.cross new file mode 100644 index 000000000..94f2fb30c --- /dev/null +++ b/docker/cross-files/riscv64+rvv_vlen512_elen64_zvfh-clang-17-ccache.cross @@ -0,0 +1,37 @@ +[binaries] +c = 'clang-17' +cpp = 'clang++-17' +ar = 'llvm-ar-17' +strip = 'llvm-strip-17' +objcopy = 'llvm-objcopy-17' +ld = 'llvm-ld-17' +exe_wrapper = ['qemu-riscv64', '-L', '/usr/riscv64-linux-gnu/', '-cpu', 'rv64,v=true,Zfh=true,x-zvfh=true,vlen=512,elen=64,vext_spec=v1.0'] + +[properties] +c_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1_zfh_zvfh', + '-menable-experimental-extensions', + '-mrvv-vector-bits=512', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +cpp_args = ['--target=riscv64-linux-gnu', + '-isystem=/usr/riscv64-linux-gnu/include', + '-march=rv64gcv1_zfh_zvfh', + '-menable-experimental-extensions', + '-mrvv-vector-bits=512', + '-Wextra', + '-Werror', + '-Wno-unsafe-buffer-usage'] + +c_link_args = ['--target=riscv64-linux-gnu'] + +cpp_link_args = ['--target=riscv64-linux-gnu'] + +[host_machine] +system = 'linux' +cpu_family = 'riscv64' +cpu = 'rv64,v=true,Zfh=true,x-zvfh=true,vlen=512,elen=64,vext_spec=v1.0' +endian = 'little' diff --git a/simde/arm/neon/add.h b/simde/arm/neon/add.h index 18a7e55da..8b4fe3499 100644 --- a/simde/arm/neon/add.h +++ b/simde/arm/neon/add.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADD_H) @@ -89,10 +90,14 @@ simde_vadd_f16(simde_float16x4_t a, simde_float16x4_t b) { a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfadd_vv_f16m1(a_.sv64, b_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_float16x4_from_private(r_); #endif @@ 
-113,7 +118,9 @@ simde_vadd_f32(simde_float32x2_t a, simde_float32x2_t b) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfadd_vv_f32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -141,7 +148,9 @@ simde_vadd_f64(simde_float64x1_t a, simde_float64x1_t b) { a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfadd_vv_f64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -169,7 +178,9 @@ simde_vadd_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi8(a_.m64, b_.m64); @@ -199,7 +210,9 @@ simde_vadd_s16(simde_int16x4_t a, simde_int16x4_t b) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i16m1(a_.sv64, b_.sv64, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi16(a_.m64, b_.m64); @@ -229,7 +242,9 @@ simde_vadd_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi32(a_.m64, b_.m64); @@ -259,7 +274,9 @@ simde_vadd_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -287,7 +304,9 @@ simde_vadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -315,7 +334,10 @@ simde_vadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) { a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u16m1(a_.sv64, b_.sv64, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -343,7 +365,9 @@ simde_vadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + 
b_.values; #else SIMDE_VECTORIZE @@ -371,7 +395,9 @@ simde_vadd_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -398,10 +424,15 @@ simde_vaddq_f16(simde_float16x8_t a, simde_float16x8_t b) { r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); - } + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfadd_vv_f16m1(a_.sv128, b_.sv128, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_float16x8_from_private(r_); #endif @@ -432,6 +463,8 @@ simde_vaddq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128 = _mm_add_ps(a_.m128, b_.m128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfadd_vv_f32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -466,6 +499,8 @@ simde_vaddq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128d = _mm_add_pd(a_.m128d, b_.m128d); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfadd_vv_f64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -500,6 +535,8 @@ simde_vaddq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_add_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -534,6 +571,8 @@ simde_vaddq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_add_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -568,6 +607,8 @@ simde_vaddq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_add_epi32(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -602,6 +643,8 @@ simde_vaddq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.m128i = _mm_add_epi64(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -632,7 +675,9 @@ simde_vaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { a_ = simde_uint8x16_to_private(a), b_ = 
simde_uint8x16_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u8m1(a_.sv128, b_.sv128, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -662,7 +707,9 @@ simde_vaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u16m1(a_.sv128, b_.sv128, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -692,7 +739,9 @@ simde_vaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u32m1(a_.sv128, b_.sv128, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -722,7 +771,9 @@ simde_vaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u64m1(a_.sv128, b_.sv128, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE diff --git a/simde/arm/neon/ld1.h b/simde/arm/neon/ld1.h index 51624a849..5dd2d17c6 100644 --- a/simde/arm/neon/ld1.h +++ b/simde/arm/neon/ld1.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD1_H) @@ -42,7 +43,11 @@ simde_vld1_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_f16(ptr); #else simde_float16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float16x4_from_private(r_); #endif } @@ -58,7 +63,11 @@ simde_vld1_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_f32(ptr); #else simde_float32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_f32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float32x2_from_private(r_); #endif } @@ -74,7 +83,11 @@ simde_vld1_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_f64(ptr); #else simde_float64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_f64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float64x1_from_private(r_); #endif } @@ -90,7 +103,11 @@ simde_vld1_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_s8(ptr); #else simde_int8x8_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_i8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int8x8_from_private(r_); #endif } @@ -106,7 +123,11 @@ simde_vld1_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_s16(ptr); #else simde_int16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_i16m1(ptr , 4); + #else + 
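
The arithmetic changes above are mechanical: whenever SIMDE_RISCV_V_NATIVE is defined, a fixed-width NEON operation lowers to a single LMUL=1 RVV intrinsic whose explicit vl is the NEON lane count (8 for int8x8_t, 2 for float32x2_t, and so on), leaving lanes past vl to the tail policy. A minimal standalone sketch of that pattern, assuming a clang with the v1.0 RVV intrinsics; this is an illustration, not code from the patch:

    #include <riscv_vector.h>

    /* vadd_s8 analogue: one vadd.vv over exactly 8 lanes. */
    static inline vint8m1_t add_s8x8(vint8m1_t a, vint8m1_t b) {
      return __riscv_vadd_vv_i8m1(a, b, 8);  /* vl = 8 matches int8x8_t */
    }
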
simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int16x4_from_private(r_); #endif } @@ -122,7 +143,11 @@ simde_vld1_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_s32(ptr); #else simde_int32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_i32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int32x2_from_private(r_); #endif } @@ -138,7 +163,11 @@ simde_vld1_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_s64(ptr); #else simde_int64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_i64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int64x1_from_private(r_); #endif } @@ -154,7 +183,11 @@ simde_vld1_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_u8(ptr); #else simde_uint8x8_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_u8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint8x8_from_private(r_); #endif } @@ -170,7 +203,11 @@ simde_vld1_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_u16(ptr); #else simde_uint16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_u16m1(ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint16x4_from_private(r_); #endif } @@ -186,7 +223,11 @@ simde_vld1_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_u32(ptr); #else simde_uint32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_u32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint32x2_from_private(r_); #endif } @@ -202,7 +243,11 @@ simde_vld1_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_u64(ptr); #else simde_uint64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_u64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint64x1_from_private(r_); #endif } @@ -220,6 +265,8 @@ simde_vld1q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_float16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -240,6 +287,8 @@ simde_vld1q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_float32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_f32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -260,6 +309,8 @@ simde_vld1q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_float64x2_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_f64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -280,6 +331,8 @@ simde_vld1q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_int8x16_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_i8m1(ptr , 16); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -300,6 +353,8 @@ 
simde_vld1q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_int16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_i16m1(ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -320,6 +375,8 @@ simde_vld1q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_int32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_i32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -340,6 +397,8 @@ simde_vld1q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_int64x2_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_i64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -360,6 +419,8 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_uint8x16_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -380,6 +441,8 @@ simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_uint16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -400,6 +463,8 @@ simde_vld1q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_uint32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_u32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -420,6 +485,8 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_uint64x2_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -438,7 +505,11 @@ simde_vld1_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_p8(ptr); #else simde_poly8x8_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_u8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_poly8x8_from_private(r_); #endif } @@ -454,7 +525,11 @@ simde_vld1_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_p16(ptr); #else simde_poly16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_u16m1(ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_poly16x4_from_private(r_); #endif } @@ -470,7 +545,11 @@ simde_vld1_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_p64(ptr); #else simde_poly64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_u64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_poly64x1_from_private(r_); #endif } @@ -486,7 +565,11 @@ simde_vld1q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1q_p8(ptr); #else simde_poly8x16_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return 
simde_poly8x16_from_private(r_); #endif } @@ -502,7 +585,11 @@ simde_vld1q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1q_p16(ptr); #else simde_poly16x8_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_poly16x8_from_private(r_); #endif } @@ -518,7 +605,11 @@ simde_vld1q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1q_p64(ptr); #else simde_poly64x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_poly64x2_from_private(r_); #endif } diff --git a/simde/arm/neon/ld1_x2.h b/simde/arm/neon/ld1_x2.h index 237c28afb..75ce61d10 100644 --- a/simde/arm/neon/ld1_x2.h +++ b/simde/arm/neon/ld1_x2.h @@ -25,6 +25,7 @@ * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD1_X2_H) @@ -51,9 +52,14 @@ simde_vld1_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_f16_x2(ptr); #else simde_float16x4_private a_[2]; - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_float16x4x2_t s_ = { { simde_float16x4_from_private(a_[0]), simde_float16x4_from_private(a_[1]) } }; return s_; @@ -74,9 +80,14 @@ simde_vld1_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_f32_x2(ptr); #else simde_float32x2_private a_[2]; - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_float32x2x2_t s_ = { { simde_float32x2_from_private(a_[0]), simde_float32x2_from_private(a_[1]) } }; return s_; @@ -97,9 +108,14 @@ simde_vld1_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_f64_x2(ptr); #else simde_float64x1_private a_[2]; - for (size_t i = 0; i < 2; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_float64x1x2_t s_ = { { simde_float64x1_from_private(a_[0]), simde_float64x1_from_private(a_[1]) } }; return s_; @@ -120,9 +136,14 @@ simde_vld1_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1_s8_x2(ptr); #else simde_int8x8_private a_[2]; - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_int8x8x2_t s_ = { { simde_int8x8_from_private(a_[0]), simde_int8x8_from_private(a_[1]) } }; return s_; @@ -143,9 +164,14 @@ simde_vld1_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { 
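
These loads can assign into r_.sv64 and r_.sv128 because the types.h change (listed in the diffstat but not shown here) adds RVV members to the private unions, and the new cross files compile with -mrvv-vector-bits=..., which lets scalable RVV types be given a fixed size. A hypothetical sketch of the mechanism, assuming clang's riscv_rvv_vector_bits attribute and the __riscv_v_fixed_vlen macro it enables; the type and member names are illustrative, not the patch's:

    #include <riscv_vector.h>
    #include <stdint.h>

    /* Pin a scalable type to the build-time VLEN so it can live in a union. */
    typedef vint16m1_t fixed_vint16m1_t
        __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));

    typedef union {
      int16_t values[4];        /* portable lane-array view */
      fixed_vint16m1_t sv64;    /* view used by the new RVV code paths */
    } example_int16x4_private;
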
return vld1_s16_x2(ptr); #else simde_int16x4_private a_[2]; - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_int16x4x2_t s_ = { { simde_int16x4_from_private(a_[0]), simde_int16x4_from_private(a_[1]) } }; return s_; @@ -166,9 +192,14 @@ simde_vld1_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_s32_x2(ptr); #else simde_int32x2_private a_[2]; - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_int32x2x2_t s_ = { { simde_int32x2_from_private(a_[0]), simde_int32x2_from_private(a_[1]) } }; return s_; @@ -189,9 +220,14 @@ simde_vld1_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_s64_x2(ptr); #else simde_int64x1_private a_[2]; - for (size_t i = 0; i < 2; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_int64x1x2_t s_ = { { simde_int64x1_from_private(a_[0]), simde_int64x1_from_private(a_[1]) } }; return s_; @@ -212,9 +248,14 @@ simde_vld1_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1_u8_x2(ptr); #else simde_uint8x8_private a_[2]; - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_uint8x8x2_t s_ = { { simde_uint8x8_from_private(a_[0]), simde_uint8x8_from_private(a_[1]) } }; return s_; @@ -235,9 +276,14 @@ simde_vld1_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_u16_x2(ptr); #else simde_uint16x4_private a_[2]; - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_uint16x4x2_t s_ = { { simde_uint16x4_from_private(a_[0]), simde_uint16x4_from_private(a_[1]) } }; return s_; @@ -258,9 +304,14 @@ simde_vld1_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_u32_x2(ptr); #else simde_uint32x2_private a_[2]; - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_uint32x2x2_t s_ = { { simde_uint32x2_from_private(a_[0]), simde_uint32x2_from_private(a_[1]) } }; return s_; @@ -281,9 +332,14 @@ simde_vld1_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_u64_x2(ptr); #else simde_uint64x1_private a_[2]; - for (size_t i = 0; i < 2; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = 
__riscv_vle64_v_u64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_uint64x1x2_t s_ = { { simde_uint64x1_from_private(a_[0]), simde_uint64x1_from_private(a_[1]) } }; return s_; @@ -301,9 +357,14 @@ simde_vld1_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1_p8_x2(ptr); #else simde_poly8x8_private a_[2]; - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_poly8x8x2_t s_ = { { simde_poly8x8_from_private(a_[0]), simde_poly8x8_from_private(a_[1]) } }; return s_; @@ -321,9 +382,14 @@ simde_vld1_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_p16_x2(ptr); #else simde_poly16x4_private a_[2]; - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_poly16x4x2_t s_ = { { simde_poly16x4_from_private(a_[0]), simde_poly16x4_from_private(a_[1]) } }; return s_; @@ -344,9 +410,14 @@ simde_vld1_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_p64_x2(ptr); #else simde_poly64x1_private a_[2]; - for (size_t i = 0; i < 2; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_poly64x1x2_t s_ = { { simde_poly64x1_from_private(a_[0]), simde_poly64x1_from_private(a_[1]) } }; return s_; diff --git a/simde/arm/neon/ld1_x3.h b/simde/arm/neon/ld1_x3.h index f4616af4c..bdaf8e527 100644 --- a/simde/arm/neon/ld1_x3.h +++ b/simde/arm/neon/ld1_x3.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD1_X3_H) @@ -50,9 +51,15 @@ simde_vld1_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return vld1_f16_x3(ptr); #else simde_float16x4_private a_[3]; - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + a_[2].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_float16x4x3_t s_ = { { simde_float16x4_from_private(a_[0]), simde_float16x4_from_private(a_[1]), simde_float16x4_from_private(a_[2]) } }; @@ -74,9 +81,15 @@ simde_vld1_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(6)]) { return vld1_f32_x3(ptr); #else simde_float32x2_private a_[3]; - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_f32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_float32x2x3_t s_ = { { 
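
Note that the _x2 and _x3 variants above are implemented as plain consecutive unit-stride loads (two or three vle calls at ptr, ptr+n, ...), not as segment loads. A small consumer-side sketch of the portable API this accelerates, for illustration only:

    #include "simde/arm/neon.h"

    /* On an RVV build of this patch, both halves come from vle16.v loads. */
    void split_load(const int16_t buf[8], simde_int16x4x2_t *out) {
      *out = simde_vld1_s16_x2(buf);
    }
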
simde_float32x2_from_private(a_[0]), simde_float32x2_from_private(a_[1]), simde_float32x2_from_private(a_[2]) } }; @@ -98,9 +111,15 @@ simde_vld1_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(3)]) { return vld1_f64_x3(ptr); #else simde_float64x1_private a_[3]; - for (size_t i = 0; i < 3; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_f64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_float64x1x3_t s_ = { { simde_float64x1_from_private(a_[0]), simde_float64x1_from_private(a_[1]), simde_float64x1_from_private(a_[2]) } }; @@ -122,9 +141,15 @@ simde_vld1_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return vld1_s8_x3(ptr); #else simde_int8x8_private a_[3]; - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_i8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_int8x8x3_t s_ = { { simde_int8x8_from_private(a_[0]), simde_int8x8_from_private(a_[1]), simde_int8x8_from_private(a_[2]) } }; @@ -146,9 +171,15 @@ simde_vld1_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return vld1_s16_x3(ptr); #else simde_int16x4_private a_[3]; - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_i16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_int16x4x3_t s_ = { { simde_int16x4_from_private(a_[0]), simde_int16x4_from_private(a_[1]), simde_int16x4_from_private(a_[2]) } }; @@ -170,9 +201,15 @@ simde_vld1_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { return vld1_s32_x3(ptr); #else simde_int32x2_private a_[3]; - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_i32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_int32x2x3_t s_ = { { simde_int32x2_from_private(a_[0]), simde_int32x2_from_private(a_[1]), simde_int32x2_from_private(a_[2]) } }; @@ -194,9 +231,15 @@ simde_vld1_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { return vld1_s64_x3(ptr); #else simde_int64x1_private a_[3]; - for (size_t i = 0; i < 3; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_i64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_int64x1x3_t s_ = { { simde_int64x1_from_private(a_[0]), simde_int64x1_from_private(a_[1]), simde_int64x1_from_private(a_[2]) } }; @@ -218,9 +261,15 @@ simde_vld1_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return vld1_u8_x3(ptr); #else simde_uint8x8_private a_[3]; - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = 
__riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_uint8x8x3_t s_ = { { simde_uint8x8_from_private(a_[0]), simde_uint8x8_from_private(a_[1]), simde_uint8x8_from_private(a_[2]) } }; @@ -242,9 +291,15 @@ simde_vld1_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return vld1_u16_x3(ptr); #else simde_uint16x4_private a_[3]; - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_uint16x4x3_t s_ = { { simde_uint16x4_from_private(a_[0]), simde_uint16x4_from_private(a_[1]), simde_uint16x4_from_private(a_[2]) } }; @@ -266,9 +321,15 @@ simde_vld1_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { return vld1_u32_x3(ptr); #else simde_uint32x2_private a_[3]; - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_u32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_uint32x2x3_t s_ = { { simde_uint32x2_from_private(a_[0]), simde_uint32x2_from_private(a_[1]), simde_uint32x2_from_private(a_[2]) } }; @@ -290,9 +351,15 @@ simde_vld1_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { return vld1_u64_x3(ptr); #else simde_uint64x1_private a_[3]; - for (size_t i = 0; i < 3; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_uint64x1x3_t s_ = { { simde_uint64x1_from_private(a_[0]), simde_uint64x1_from_private(a_[1]), simde_uint64x1_from_private(a_[2]) } }; @@ -312,9 +379,15 @@ simde_vld1_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return vld1_p8_x3(ptr); #else simde_poly8x8_private a_[3]; - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_poly8x8x3_t s_ = { { simde_poly8x8_from_private(a_[0]), simde_poly8x8_from_private(a_[1]), simde_poly8x8_from_private(a_[2]) } }; @@ -334,9 +407,15 @@ simde_vld1_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return vld1_p16_x3(ptr); #else simde_poly16x4_private a_[3]; - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_poly16x4x3_t s_ = { { simde_poly16x4_from_private(a_[0]), simde_poly16x4_from_private(a_[1]), simde_poly16x4_from_private(a_[2]) } }; @@ -358,9 +437,15 @@ simde_vld1_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) 
{ return vld1_p64_x3(ptr); #else simde_poly64x1_private a_[3]; - for (size_t i = 0; i < 3; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_poly64x1x3_t s_ = { { simde_poly64x1_from_private(a_[0]), simde_poly64x1_from_private(a_[1]), simde_poly64x1_from_private(a_[2]) } }; diff --git a/simde/arm/neon/ld1_x4.h b/simde/arm/neon/ld1_x4.h index b8b505aaf..1d797364b 100644 --- a/simde/arm/neon/ld1_x4.h +++ b/simde/arm/neon/ld1_x4.h @@ -25,6 +25,7 @@ * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD1_X4_H) @@ -51,9 +52,16 @@ simde_vld1_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1_f16_x4(ptr); #else simde_float16x4_private a_[4]; - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + a_[2].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 4); + a_[3].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+12) , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_float16x4x4_t s_ = { { simde_float16x4_from_private(a_[0]), simde_float16x4_from_private(a_[1]), simde_float16x4_from_private(a_[2]), @@ -76,9 +84,16 @@ simde_vld1_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_f32_x4(ptr); #else simde_float32x2_private a_[4]; - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_f32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_f32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_float32x2x4_t s_ = { { simde_float32x2_from_private(a_[0]), simde_float32x2_from_private(a_[1]), simde_float32x2_from_private(a_[2]), @@ -101,9 +116,16 @@ simde_vld1_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_f64_x4(ptr); #else simde_float64x1_private a_[4]; - for (size_t i = 0; i < 4; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_f64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_f64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_float64x1x4_t s_ = { { simde_float64x1_from_private(a_[0]), simde_float64x1_from_private(a_[1]), simde_float64x1_from_private(a_[2]), @@ -126,9 +148,16 @@ simde_vld1_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1_s8_x4(ptr); #else simde_int8x8_private a_[4]; - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_i8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_i8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) 
{ + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_int8x8x4_t s_ = { { simde_int8x8_from_private(a_[0]), simde_int8x8_from_private(a_[1]), simde_int8x8_from_private(a_[2]), @@ -151,9 +180,16 @@ simde_vld1_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1_s16_x4(ptr); #else simde_int16x4_private a_[4]; - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_i16m1(ptr+8 , 4); + a_[3].sv64 = __riscv_vle16_v_i16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_int16x4x4_t s_ = { { simde_int16x4_from_private(a_[0]), simde_int16x4_from_private(a_[1]), simde_int16x4_from_private(a_[2]), @@ -176,9 +212,16 @@ simde_vld1_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_s32_x4(ptr); #else simde_int32x2_private a_[4]; - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_i32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_i32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_int32x2x4_t s_ = { { simde_int32x2_from_private(a_[0]), simde_int32x2_from_private(a_[1]), simde_int32x2_from_private(a_[2]), @@ -201,9 +244,16 @@ simde_vld1_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_s64_x4(ptr); #else simde_int64x1_private a_[4]; - for (size_t i = 0; i < 4; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_i64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_i64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_int64x1x4_t s_ = { { simde_int64x1_from_private(a_[0]), simde_int64x1_from_private(a_[1]), simde_int64x1_from_private(a_[2]), @@ -226,9 +276,16 @@ simde_vld1_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1_u8_x4(ptr); #else simde_uint8x8_private a_[4]; - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_u8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_uint8x8x4_t s_ = { { simde_uint8x8_from_private(a_[0]), simde_uint8x8_from_private(a_[1]), simde_uint8x8_from_private(a_[2]), @@ -251,9 +308,16 @@ simde_vld1_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1_u16_x4(ptr); #else simde_uint16x4_private a_[4]; - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + a_[3].sv64 = __riscv_vle16_v_u16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_uint16x4x4_t s_ = { { simde_uint16x4_from_private(a_[0]), simde_uint16x4_from_private(a_[1]), simde_uint16x4_from_private(a_[2]), @@ -276,9 +340,16 @@ 
simde_vld1_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_u32_x4(ptr); #else simde_uint32x2_private a_[4]; - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_u32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_u32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_uint32x2x4_t s_ = { { simde_uint32x2_from_private(a_[0]), simde_uint32x2_from_private(a_[1]), simde_uint32x2_from_private(a_[2]), @@ -301,9 +372,16 @@ simde_vld1_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_u64_x4(ptr); #else simde_uint64x1_private a_[4]; - for (size_t i = 0; i < 4; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_u64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_uint64x1x4_t s_ = { { simde_uint64x1_from_private(a_[0]), simde_uint64x1_from_private(a_[1]), simde_uint64x1_from_private(a_[2]), @@ -324,9 +402,16 @@ simde_vld1_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1_p8_x4(ptr); #else simde_poly8x8_private a_[4]; - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_u8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_poly8x8x4_t s_ = { { simde_poly8x8_from_private(a_[0]), simde_poly8x8_from_private(a_[1]), simde_poly8x8_from_private(a_[2]), @@ -347,9 +432,16 @@ simde_vld1_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1_p16_x4(ptr); #else simde_poly16x4_private a_[4]; - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + a_[3].sv64 = __riscv_vle16_v_u16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_poly16x4x4_t s_ = { { simde_poly16x4_from_private(a_[0]), simde_poly16x4_from_private(a_[1]), simde_poly16x4_from_private(a_[2]), @@ -372,9 +464,16 @@ simde_vld1_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_p64_x4(ptr); #else simde_poly64x1_private a_[4]; - for (size_t i = 0; i < 4; i++) { - a_[i].values[0] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_u64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif simde_poly64x1x4_t s_ = { { simde_poly64x1_from_private(a_[0]), simde_poly64x1_from_private(a_[1]), simde_poly64x1_from_private(a_[2]), diff --git a/simde/arm/neon/ld1q_x2.h b/simde/arm/neon/ld1q_x2.h index 25453d97c..da1da866a 100644 --- a/simde/arm/neon/ld1q_x2.h +++ b/simde/arm/neon/ld1q_x2.h @@ -25,6 +25,7 @@ * 2021 Zhi An Ng (Copyright 
owned by Google, LLC) * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD1Q_X2_H) @@ -52,9 +53,14 @@ simde_vld1q_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1q_f16_x2(ptr); #else simde_float16x8_private a_[2]; - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_float16x8x2_t s_ = { { simde_float16x8_from_private(a_[0]), simde_float16x8_from_private(a_[1]) } }; return s_; @@ -75,9 +81,14 @@ simde_vld1q_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1q_f32_x2(ptr); #else simde_float32x4_private a_[2]; - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_float32x4x2_t s_ = { { simde_float32x4_from_private(a_[0]), simde_float32x4_from_private(a_[1]) } }; return s_; @@ -98,9 +109,14 @@ simde_vld1q_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1q_f64_x2(ptr); #else simde_float64x2_private a_[2]; - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_float64x2x2_t s_ = { { simde_float64x2_from_private(a_[0]), simde_float64x2_from_private(a_[1]) } }; return s_; @@ -121,9 +137,14 @@ simde_vld1q_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1q_s8_x2(ptr); #else simde_int8x16_private a_[2]; - for (size_t i = 0; i < 32; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_int8x16x2_t s_ = { { simde_int8x16_from_private(a_[0]), simde_int8x16_from_private(a_[1]) } }; return s_; @@ -144,9 +165,14 @@ simde_vld1q_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1q_s16_x2(ptr); #else simde_int16x8_private a_[2]; - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_int16x8x2_t s_ = { { simde_int16x8_from_private(a_[0]), simde_int16x8_from_private(a_[1]) } }; return s_; @@ -167,9 +193,14 @@ simde_vld1q_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1q_s32_x2(ptr); #else simde_int32x4_private a_[2]; - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif 
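/* The RVV branch above is representative of every vld1q_*_x2 body in this
 * file: one unit-stride vle per 128-bit destination register, with vl fixed
 * to the NEON lane count. The f16 variant additionally requires
 * SIMDE_ARCH_RISCV_ZVFH and casts the simde_float16_t pointer to _Float16 *.
 * A minimal standalone sketch of the same pattern, assuming <riscv_vector.h>
 * and a toolchain with the V extension enabled (the buffer and function
 * names are illustrative, not part of SIMDe):
 *
 *   #include <riscv_vector.h>
 *   void load_s32_x2(const int32_t buf[8], vint32m1_t *lo, vint32m1_t *hi) {
 *     *lo = __riscv_vle32_v_i32m1(buf, 4);      // lanes 0..3
 *     *hi = __riscv_vle32_v_i32m1(buf + 4, 4);  // lanes 4..7
 *   }
 */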
simde_int32x4x2_t s_ = { { simde_int32x4_from_private(a_[0]), simde_int32x4_from_private(a_[1]) } }; return s_; @@ -190,9 +221,14 @@ simde_vld1q_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1q_s64_x2(ptr); #else simde_int64x2_private a_[2]; - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_int64x2x2_t s_ = { { simde_int64x2_from_private(a_[0]), simde_int64x2_from_private(a_[1]) } }; return s_; @@ -213,9 +249,14 @@ simde_vld1q_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1q_u8_x2(ptr); #else simde_uint8x16_private a_[2]; - for (size_t i = 0; i < 32; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_uint8x16x2_t s_ = { { simde_uint8x16_from_private(a_[0]), simde_uint8x16_from_private(a_[1]) } }; return s_; @@ -236,9 +277,14 @@ simde_vld1q_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1q_u16_x2(ptr); #else simde_uint16x8_private a_[2]; - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_uint16x8x2_t s_ = { { simde_uint16x8_from_private(a_[0]), simde_uint16x8_from_private(a_[1]) } }; return s_; @@ -259,9 +305,14 @@ simde_vld1q_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1q_u32_x2(ptr); #else simde_uint32x4_private a_[2]; - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_uint32x4x2_t s_ = { { simde_uint32x4_from_private(a_[0]), simde_uint32x4_from_private(a_[1]) } }; return s_; @@ -282,9 +333,14 @@ simde_vld1q_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1q_u64_x2(ptr); #else simde_uint64x2_private a_[2]; - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_uint64x2x2_t s_ = { { simde_uint64x2_from_private(a_[0]), simde_uint64x2_from_private(a_[1]) } }; return s_; @@ -304,9 +360,14 @@ simde_vld1q_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1q_p8_x2(ptr); #else simde_poly8x16_private a_[2]; - for (size_t i = 0; i < 32; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_poly8x16x2_t s_ = { { simde_poly8x16_from_private(a_[0]), simde_poly8x16_from_private(a_[1]) } }; return s_; @@ -326,9 +387,14 @@ simde_vld1q_p16_x2(simde_poly16_t 
const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1q_p16_x2(ptr); #else simde_poly16x8_private a_[2]; - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_poly16x8x2_t s_ = { { simde_poly16x8_from_private(a_[0]), simde_poly16x8_from_private(a_[1]) } }; return s_; @@ -348,9 +414,14 @@ simde_vld1q_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1q_p64_x2(ptr); #else simde_poly64x2_private a_[2]; - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_poly64x2x2_t s_ = { { simde_poly64x2_from_private(a_[0]), simde_poly64x2_from_private(a_[1]) } }; return s_; diff --git a/simde/arm/neon/ld1q_x3.h b/simde/arm/neon/ld1q_x3.h index d819ed900..ec82989e7 100644 --- a/simde/arm/neon/ld1q_x3.h +++ b/simde/arm/neon/ld1q_x3.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD1Q_X3_H) @@ -50,9 +51,15 @@ simde_vld1q_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return vld1q_f16_x3(ptr); #else simde_float16x8_private a_[3]; - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_float16x8x3_t s_ = { { simde_float16x8_from_private(a_[0]), simde_float16x8_from_private(a_[1]), simde_float16x8_from_private(a_[2]) } }; @@ -74,9 +81,15 @@ simde_vld1q_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(12)]) { return vld1q_f32_x3(ptr); #else simde_float32x4_private a_[3]; - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_float32x4x3_t s_ = { { simde_float32x4_from_private(a_[0]), simde_float32x4_from_private(a_[1]), simde_float32x4_from_private(a_[2]) } }; @@ -98,9 +111,15 @@ simde_vld1q_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(6)]) { return vld1q_f64_x3(ptr); #else simde_float64x2_private a_[3]; - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_float64x2x3_t s_ = { { simde_float64x2_from_private(a_[0]), simde_float64x2_from_private(a_[1]), simde_float64x2_from_private(a_[2]) } }; @@ -122,9 +141,15 @@ simde_vld1q_s8_x3(int8_t const 
ptr[HEDLEY_ARRAY_PARAM(48)]) { return vld1q_s8_x3(ptr); #else simde_int8x16_private a_[3]; - for (size_t i = 0; i < 48; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_int8x16x3_t s_ = { { simde_int8x16_from_private(a_[0]), simde_int8x16_from_private(a_[1]), simde_int8x16_from_private(a_[2]) } }; @@ -146,9 +171,15 @@ simde_vld1q_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return vld1q_s16_x3(ptr); #else simde_int16x8_private a_[3]; - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_int16x8x3_t s_ = { { simde_int16x8_from_private(a_[0]), simde_int16x8_from_private(a_[1]), simde_int16x8_from_private(a_[2]) } }; @@ -170,9 +201,15 @@ simde_vld1q_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return vld1q_s32_x3(ptr); #else simde_int32x4_private a_[3]; - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_int32x4x3_t s_ = { { simde_int32x4_from_private(a_[0]), simde_int32x4_from_private(a_[1]), simde_int32x4_from_private(a_[2]) } }; @@ -194,9 +231,15 @@ simde_vld1q_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { return vld1q_s64_x3(ptr); #else simde_int64x2_private a_[3]; - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_int64x2x3_t s_ = { { simde_int64x2_from_private(a_[0]), simde_int64x2_from_private(a_[1]), simde_int64x2_from_private(a_[2]) } }; @@ -218,9 +261,15 @@ simde_vld1q_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { return vld1q_u8_x3(ptr); #else simde_uint8x16_private a_[3]; - for (size_t i = 0; i < 48; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_uint8x16x3_t s_ = { { simde_uint8x16_from_private(a_[0]), simde_uint8x16_from_private(a_[1]), simde_uint8x16_from_private(a_[2]) } }; @@ -242,9 +291,15 @@ simde_vld1q_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return vld1q_u16_x3(ptr); #else simde_uint16x8_private a_[3]; - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + #else + for
(size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_uint16x8x3_t s_ = { { simde_uint16x8_from_private(a_[0]), simde_uint16x8_from_private(a_[1]), simde_uint16x8_from_private(a_[2]) } }; @@ -266,9 +321,15 @@ simde_vld1q_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return vld1q_u32_x3(ptr); #else simde_uint32x4_private a_[3]; - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_uint32x4x3_t s_ = { { simde_uint32x4_from_private(a_[0]), simde_uint32x4_from_private(a_[1]), simde_uint32x4_from_private(a_[2]) } }; @@ -290,9 +351,15 @@ simde_vld1q_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { return vld1q_u64_x3(ptr); #else simde_uint64x2_private a_[3]; - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_uint64x2x3_t s_ = { { simde_uint64x2_from_private(a_[0]), simde_uint64x2_from_private(a_[1]), simde_uint64x2_from_private(a_[2]) } }; @@ -313,9 +380,15 @@ simde_vld1q_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { return vld1q_p8_x3(ptr); #else simde_poly8x16_private a_[3]; - for (size_t i = 0; i < 48; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_poly8x16x3_t s_ = { { simde_poly8x16_from_private(a_[0]), simde_poly8x16_from_private(a_[1]), simde_poly8x16_from_private(a_[2]) } }; @@ -336,9 +409,15 @@ simde_vld1q_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return vld1q_p16_x3(ptr); #else simde_poly16x8_private a_[3]; - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_poly16x8x3_t s_ = { { simde_poly16x8_from_private(a_[0]), simde_poly16x8_from_private(a_[1]), simde_poly16x8_from_private(a_[2]) } }; @@ -359,9 +438,15 @@ simde_vld1q_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { return vld1q_p64_x3(ptr); #else simde_poly64x2_private a_[3]; - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_poly64x2x3_t s_ = { { simde_poly64x2_from_private(a_[0]), simde_poly64x2_from_private(a_[1]), simde_poly64x2_from_private(a_[2]) } }; diff --git a/simde/arm/neon/ld1q_x4.h b/simde/arm/neon/ld1q_x4.h index 9c64e6895..2fa4c1a69 100644 ---
a/simde/arm/neon/ld1q_x4.h +++ b/simde/arm/neon/ld1q_x4.h @@ -25,6 +25,7 @@ * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD1Q_X4_H) @@ -51,9 +52,16 @@ simde_vld1q_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1q_f16_x4(ptr); #else simde_float16x8_private a_[4]; - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8); + a_[3].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+24) , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]), simde_float16x8_from_private(a_[1]), simde_float16x8_from_private(a_[2]), @@ -76,9 +84,16 @@ simde_vld1q_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1q_f32_x4(ptr); #else simde_float32x4_private a_[4]; - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_f32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]), simde_float32x4_from_private(a_[1]), simde_float32x4_from_private(a_[2]), @@ -101,9 +116,16 @@ simde_vld1q_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1q_f64_x4(ptr); #else simde_float64x2_private a_[4]; - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_f64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]), simde_float64x2_from_private(a_[1]), simde_float64x2_from_private(a_[2]), @@ -126,9 +148,16 @@ simde_vld1q_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { return vld1q_s8_x4(ptr); #else simde_int8x16_private a_[4]; - for (size_t i = 0; i < 64; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_i8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]), simde_int8x16_from_private(a_[1]), simde_int8x16_from_private(a_[2]), @@ -151,9 +180,16 @@ simde_vld1q_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1q_s16_x4(ptr); #else simde_int16x8_private a_[4]; - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + a_[2].sv128 = 
__riscv_vle16_v_i16m1(ptr+16 , 8); + a_[3].sv128 = __riscv_vle16_v_i16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]), simde_int16x8_from_private(a_[1]), simde_int16x8_from_private(a_[2]), @@ -176,9 +212,16 @@ simde_vld1q_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1q_s32_x4(ptr); #else simde_int32x4_private a_[4]; - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_i32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_int32x4x4_t s_ = { { simde_int32x4_from_private(a_[0]), simde_int32x4_from_private(a_[1]), simde_int32x4_from_private(a_[2]), @@ -201,9 +244,16 @@ simde_vld1q_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1q_s64_x4(ptr); #else simde_int64x2_private a_[4]; - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_i64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]), simde_int64x2_from_private(a_[1]), simde_int64x2_from_private(a_[2]), @@ -226,9 +276,16 @@ simde_vld1q_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { return vld1q_u8_x4(ptr); #else simde_uint8x16_private a_[4]; - for (size_t i = 0; i < 64; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_uint8x16x4_t s_ = { { simde_uint8x16_from_private(a_[0]), simde_uint8x16_from_private(a_[1]), simde_uint8x16_from_private(a_[2]), @@ -251,9 +308,16 @@ simde_vld1q_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1q_u16_x4(ptr); #else simde_uint16x8_private a_[4]; - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]), simde_uint16x8_from_private(a_[1]), simde_uint16x8_from_private(a_[2]), @@ -276,9 +340,16 @@ simde_vld1q_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld1q_u32_x4(ptr); #else simde_uint32x4_private a_[4]; - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_u32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) {
a_[i / 4].values[i % 4] = ptr[i]; + } + #endif simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]), simde_uint32x4_from_private(a_[1]), simde_uint32x4_from_private(a_[2]), @@ -301,9 +372,16 @@ simde_vld1q_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1q_u64_x4(ptr); #else simde_uint64x2_private a_[4]; - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]), simde_uint64x2_from_private(a_[1]), simde_uint64x2_from_private(a_[2]), @@ -325,9 +403,16 @@ simde_vld1q_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { return vld1q_p8_x4(ptr); #else simde_poly8x16_private a_[4]; - for (size_t i = 0; i < 64; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]), simde_poly8x16_from_private(a_[1]), simde_poly8x16_from_private(a_[2]), @@ -349,9 +434,16 @@ simde_vld1q_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld1q_p16_x4(ptr); #else simde_poly16x8_private a_[4]; - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]), simde_poly16x8_from_private(a_[1]), simde_poly16x8_from_private(a_[2]), @@ -373,9 +465,16 @@ simde_vld1q_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1q_p64_x4(ptr); #else simde_poly64x2_private a_[4]; - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]), simde_poly64x2_from_private(a_[1]), simde_poly64x2_from_private(a_[2]), diff --git a/simde/arm/neon/ld2.h b/simde/arm/neon/ld2.h index 8011790c6..4f0a8ca94 100644 --- a/simde/arm/neon/ld2.h +++ b/simde/arm/neon/ld2.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD2_H) @@ -58,6 +59,16 @@ simde_vld2_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vget_high_s8(q) }; return u; + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private a_[2]; + vint8m1x2_t dest = 
__riscv_vlseg2e8_v_i8m1x2(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_i8m1x2_i8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i8m1x2_i8m1(dest, 1); + simde_int8x8x2_t r = { { + simde_int8x8_from_private(a_[0]), + simde_int8x8_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int8x16_private a_ = simde_int8x16_to_private(simde_vld1q_s8(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); @@ -91,6 +102,16 @@ simde_int16x4x2_t simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private a_[2]; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_i16m1x2_i16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i16m1x2_i16m1(dest, 1); + simde_int16x4x2_t r = { { + simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int16x8_private a_ = simde_int16x8_to_private(simde_vld1q_s16(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 0, 2, 4, 6, 1, 3, 5, 7); @@ -131,6 +152,16 @@ simde_int32x2x2_t simde_vld2_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private a_[2]; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_i32m1x2_i32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i32m1x2_i32m1(dest, 1); + simde_int32x2x2_t r = { { + simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int32x4_private a_ = simde_int32x4_to_private(simde_vld1q_s32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -164,6 +195,16 @@ simde_int64x1x2_t simde_vld2_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x1_private a_[2]; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_i64m1x2_i64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i64m1x2_i64m1(dest, 1); + simde_int64x1x2_t r = { { + simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int64x2_private a_ = simde_int64x2_to_private(simde_vld1q_s64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ -208,6 +249,16 @@ simde_vld2_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vget_high_u8(q) }; return u; + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private a_[2]; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + simde_uint8x8x2_t r = { { + simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint8x16_private a_ = simde_uint8x16_to_private(simde_vld1q_u8(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); @@ -241,6 +292,16 @@ simde_uint16x4x2_t 
simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private a_[2]; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + simde_uint16x4x2_t r = { { + simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint16x8_private a_ = simde_uint16x8_to_private(simde_vld1q_u16(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 0, 2, 4, 6, 1, 3, 5, 7); @@ -281,6 +342,16 @@ simde_uint32x2x2_t simde_vld2_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private a_[2]; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_u32m1x2_u32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u32m1x2_u32m1(dest, 1); + simde_uint32x2x2_t r = { { + simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint32x4_private a_ = simde_uint32x4_to_private(simde_vld1q_u32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -314,6 +385,16 @@ simde_uint64x1x2_t simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x1_private a_[2]; + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + simde_uint64x1x2_t r = { { + simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint64x2_private a_ = simde_uint64x2_to_private(simde_vld1q_u64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ -347,6 +428,16 @@ simde_float16x4x2_t simde_vld2_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vld2_f16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x4_private r_[2]; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_f16m1x2_f16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f16m1x2_f16m1(dest, 1); + simde_float16x4x2_t r = { { + simde_float16x4_from_private(r_[0]), + simde_float16x4_from_private(r_[1]), + } }; + return r; #else simde_float16x4_private r_[2]; @@ -374,6 +465,16 @@ simde_float32x2x2_t simde_vld2_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_[2]; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_f32m1x2_f32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f32m1x2_f32m1(dest, 1); + simde_float32x2x2_t r = { { + simde_float32x2_from_private(r_[0]), + simde_float32x2_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) 
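/* Unlike this shuffle fallback, the RVV branch above de-interleaves during
 * the load itself: with vl = 2, vlseg2e32 turns {a0, b0, a1, b1} in memory
 * into segment 0 = {a0, a1} and segment 1 = {b0, b1}. A small standalone
 * sketch of that behaviour, assuming <riscv_vector.h> (the function name is
 * illustrative, not part of SIMDe):
 *
 *   #include <riscv_vector.h>
 *   void deinterleave2_f32(const float buf[4],
 *                          vfloat32m1_t *a, vfloat32m1_t *b) {
 *     vfloat32m1x2_t seg = __riscv_vlseg2e32_v_f32m1x2(buf, 2); // 2 pairs
 *     *a = __riscv_vget_v_f32m1x2_f32m1(seg, 0); // even elements: a0, a1
 *     *b = __riscv_vget_v_f32m1x2_f32m1(seg, 1); // odd elements:  b0, b1
 *   }
 *
 * The SIMDE_SHUFFLE_VECTOR_ path below reaches the same layout with one
 * 128-bit load followed by an even/odd shuffle.
 */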
simde_float32x4_private a_ = simde_float32x4_to_private(simde_vld1q_f32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -407,6 +508,16 @@ simde_float64x1x2_t simde_vld2_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private r_[2]; + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_f64m1x2_f64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f64m1x2_f64m1(dest, 1); + simde_float64x1x2_t r = { { + simde_float64x1_from_private(r_[0]), + simde_float64x1_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_float64x2_private a_ = simde_float64x2_to_private(simde_vld1q_f64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ -440,6 +551,16 @@ simde_int8x16x2_t simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private a_[2]; + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_i8m1x2_i8m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i8m1x2_i8m1(dest, 1); + simde_int8x16x2_t r = { { + simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s8( @@ -480,6 +601,16 @@ simde_int32x4x2_t simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private a_[2]; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_i32m1x2_i32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i32m1x2_i32m1(dest, 1); + simde_int32x4x2_t r = { { + simde_int32x4_from_private(a_[0]), + simde_int32x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s32( @@ -520,6 +651,16 @@ simde_int16x8x2_t simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_[2]; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_i16m1x2_i16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i16m1x2_i16m1(dest, 1); + simde_int16x8x2_t r = { { + simde_int16x8_from_private(r_[0]), + simde_int16x8_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s16( @@ -560,6 +701,16 @@ simde_int64x2x2_t simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_[2]; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_i64m1x2_i64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i64m1x2_i64m1(dest, 1); + simde_int64x2x2_t r = { { + simde_int64x2_from_private(r_[0]), + simde_int64x2_from_private(r_[1]), + } }; + return r; #else simde_int64x2_private r_[2]; @@ -587,6 +738,16 @@ simde_uint8x16x2_t simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private r_[2]; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 16); + r_[0].sv128 
= __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + simde_uint8x16x2_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u8( @@ -627,6 +788,16 @@ simde_uint16x8x2_t simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_[2]; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + simde_uint16x8x2_t r = { { + simde_uint16x8_from_private(r_[0]), + simde_uint16x8_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u16( @@ -667,6 +838,16 @@ simde_uint32x4x2_t simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_[2]; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_u32m1x2_u32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u32m1x2_u32m1(dest, 1); + simde_uint32x4x2_t r = { { + simde_uint32x4_from_private(r_[0]), + simde_uint32x4_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u32( @@ -707,6 +888,16 @@ simde_uint64x2x2_t simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_[2]; + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + simde_uint64x2x2_t r = { { + simde_uint64x2_from_private(r_[0]), + simde_uint64x2_from_private(r_[1]), + } }; + return r; #else simde_uint64x2_private r_[2]; @@ -734,6 +925,16 @@ simde_float16x8x2_t simde_vld2q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vld2q_f16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x8_private r_[2]; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_f16m1x2_f16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f16m1x2_f16m1(dest, 1); + simde_float16x8x2_t r = { { + simde_float16x8_from_private(r_[0]), + simde_float16x8_from_private(r_[1]), + } }; + return r; #else #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) HEDLEY_DIAGNOSTIC_PUSH @@ -768,6 +969,16 @@ simde_float32x4x2_t simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_[2]; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_f32m1x2_f32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f32m1x2_f32m1(dest, 1); + simde_float32x4x2_t r = { { + simde_float32x4_from_private(r_[0]), + simde_float32x4_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_f32( @@ -808,6 +1019,16 @@ simde_float64x2x2_t simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x2_private r_[2]; + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_f64m1x2_f64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f64m1x2_f64m1(dest, 1); + simde_float64x2x2_t r = { { + simde_float64x2_from_private(r_[0]), + simde_float64x2_from_private(r_[1]), + } }; + return r; #else simde_float64x2_private r_[2]; @@ -837,13 +1058,17 @@ simde_vld2_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld2_p8(ptr); #else simde_poly8x8_private r_[2]; - - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_poly8x8x2_t r = { { simde_poly8x8_from_private(r_[0]), simde_poly8x8_from_private(r_[1]), @@ -868,12 +1093,17 @@ simde_vld2_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ #endif simde_poly16x4_private r_[2]; - - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) HEDLEY_DIAGNOSTIC_POP #endif @@ -899,11 +1129,17 @@ simde_vld2_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #else simde_poly64x1_private r_[2]; - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly64x1x2_t r = { { simde_poly64x1_from_private(r_[0]), @@ -930,11 +1166,17 @@ simde_vld2q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { #endif simde_poly8x16_private r_[2]; - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t 
dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly8x16x2_t r = { { simde_poly8x16_from_private(r_[0]), @@ -964,11 +1206,17 @@ simde_vld2q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #endif simde_poly16x8_private r_[2]; - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly16x8x2_t r = { { simde_poly16x8_from_private(r_[0]), @@ -994,11 +1242,17 @@ simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #else simde_poly64x2_private r_[2]; - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly64x2x2_t r = { { simde_poly64x2_from_private(r_[0]), diff --git a/simde/arm/neon/ld3.h b/simde/arm/neon/ld3.h index b53baa836..994052fe3 100644 --- a/simde/arm/neon/ld3.h +++ b/simde/arm/neon/ld3.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD3_H) @@ -48,13 +49,18 @@ simde_vld3_f16(simde_float16_t const *ptr) { return vld3_f16(ptr); #else simde_float16x4_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_float16x4x3_t r = { { simde_float16x4_from_private(r_[0]), simde_float16x4_from_private(r_[1]), @@ -76,13 +82,18 @@ 
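/* The vld3 family extends the same idea to three-way segment loads:
 * __riscv_vlseg3e*_v_*m1x3 reads {x0, y0, z0, x1, y1, z1, ...} and splits
 * it into three registers in a single instruction. A minimal sketch,
 * assuming <riscv_vector.h> (names are illustrative, not part of SIMDe):
 *
 *   #include <riscv_vector.h>
 *   void deinterleave3_f32(const float buf[6], vfloat32m1_t *x,
 *                          vfloat32m1_t *y, vfloat32m1_t *z) {
 *     vfloat32m1x3_t seg = __riscv_vlseg3e32_v_f32m1x3(buf, 2); // 2 triplets
 *     *x = __riscv_vget_v_f32m1x3_f32m1(seg, 0);
 *     *y = __riscv_vget_v_f32m1x3_f32m1(seg, 1);
 *     *z = __riscv_vget_v_f32m1x3_f32m1(seg, 2);
 *   }
 */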
simde_vld3_f32(simde_float32 const *ptr) { return vld3_f32(ptr); #else simde_float32x2_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_float32x2x3_t r = { { simde_float32x2_from_private(r_[0]), simde_float32x2_from_private(r_[1]), @@ -104,13 +115,18 @@ simde_vld3_f64(simde_float64 const *ptr) { return vld3_f64(ptr); #else simde_float64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_float64x1x3_t r = { { simde_float64x1_from_private(r_[0]), simde_float64x1_from_private(r_[1]), @@ -132,13 +148,18 @@ simde_vld3_s8(int8_t const *ptr) { return vld3_s8(ptr); #else simde_int8x8_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int8x8x3_t r = { { simde_int8x8_from_private(r_[0]), simde_int8x8_from_private(r_[1]), @@ -160,13 +181,18 @@ simde_vld3_s16(int16_t const *ptr) { return vld3_s16(ptr); #else simde_int16x4_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * 
(sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int16x4x3_t r = { { simde_int16x4_from_private(r_[0]), simde_int16x4_from_private(r_[1]), @@ -188,13 +214,18 @@ simde_vld3_s32(int32_t const *ptr) { return vld3_s32(ptr); #else simde_int32x2_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int32x2x3_t r = { { simde_int32x2_from_private(r_[0]), simde_int32x2_from_private(r_[1]), @@ -216,13 +247,18 @@ simde_vld3_s64(int64_t const *ptr) { return vld3_s64(ptr); #else simde_int64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int64x1x3_t r = { { simde_int64x1_from_private(r_[0]), simde_int64x1_from_private(r_[1]), @@ -244,13 +280,18 @@ simde_vld3_u8(uint8_t const *ptr) { return vld3_u8(ptr); #else simde_uint8x8_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint8x8x3_t r = { { simde_uint8x8_from_private(r_[0]), simde_uint8x8_from_private(r_[1]), @@ -272,13 +313,18 @@ simde_vld3_u16(uint16_t const *ptr) { return vld3_u16(ptr); #else simde_uint16x4_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) 
/ sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint16x4x3_t r = { { simde_uint16x4_from_private(r_[0]), simde_uint16x4_from_private(r_[1]), @@ -300,13 +346,18 @@ simde_vld3_u32(uint32_t const *ptr) { return vld3_u32(ptr); #else simde_uint32x2_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint32x2x3_t r = { { simde_uint32x2_from_private(r_[0]), simde_uint32x2_from_private(r_[1]), @@ -328,13 +379,18 @@ simde_vld3_u64(uint64_t const *ptr) { return vld3_u64(ptr); #else simde_uint64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint64x1x3_t r = { { simde_uint64x1_from_private(r_[0]), simde_uint64x1_from_private(r_[1]), @@ -356,13 +412,18 @@ simde_vld3q_f16(simde_float16_t const *ptr) { return vld3q_f16(ptr); #else simde_float16x8_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_float16x8x3_t r = { { simde_float16x8_from_private(r_[0]), simde_float16x8_from_private(r_[1]), @@ -382,6 +443,18 @@ simde_float32x4x3_t simde_vld3q_f32(simde_float32 const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_[3]; + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 0); + r_[1].sv128 = 
__riscv_vget_v_f32m1x3_f32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 2); + simde_float32x4x3_t r = { { + simde_float32x4_from_private(r_[0]), + simde_float32x4_from_private(r_[1]), + simde_float32x4_from_private(r_[2]) + } }; + return r; #else simde_float32x4_private r_[3]; @@ -410,6 +483,18 @@ simde_float64x2x3_t simde_vld3q_f64(simde_float64 const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x2_private r_[3]; + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 2); + simde_float64x2x3_t r = { { + simde_float64x2_from_private(r_[0]), + simde_float64x2_from_private(r_[1]), + simde_float64x2_from_private(r_[2]) + } }; + return r; #else simde_float64x2_private r_[3]; @@ -438,6 +523,18 @@ simde_int8x16x3_t simde_vld3q_s8(int8_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_s8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private r_[3]; + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 2); + simde_int8x16x3_t r = { { + simde_int8x16_from_private(r_[0]), + simde_int8x16_from_private(r_[1]), + simde_int8x16_from_private(r_[2]) + } }; + return r; #else simde_int8x16_private r_[3]; @@ -466,6 +563,18 @@ simde_int16x8x3_t simde_vld3q_s16(int16_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_[3]; + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 2); + simde_int16x8x3_t r = { { + simde_int16x8_from_private(r_[0]), + simde_int16x8_from_private(r_[1]), + simde_int16x8_from_private(r_[2]) + } }; + return r; #else simde_int16x8_private r_[3]; @@ -494,6 +603,18 @@ simde_int32x4x3_t simde_vld3q_s32(int32_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_[3]; + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 2); + simde_int32x4x3_t r = { { + simde_int32x4_from_private(r_[0]), + simde_int32x4_from_private(r_[1]), + simde_int32x4_from_private(r_[2]) + } }; + return r; #else simde_int32x4_private r_[3]; @@ -522,6 +643,18 @@ simde_int64x2x3_t simde_vld3q_s64(int64_t const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_[3]; + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 2); + simde_int64x2x3_t r = { { + simde_int64x2_from_private(r_[0]), + simde_int64x2_from_private(r_[1]), + simde_int64x2_from_private(r_[2]) + } }; + return r; #else simde_int64x2_private r_[3]; @@ -551,6 +684,18 @@ simde_uint8x16x3_t simde_vld3q_u8(uint8_t const *ptr) { #if 
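/* Editor's note: illustrative arithmetic, not part of the patch. Every
 * q-form hunk above passes vl explicitly, and the value is always the
 * 128-bit NEON lane count, i.e. 16 bytes divided by the element size:
 * 16 for i8, 8 for i16, 4 for i32/f32, 2 for i64/f64. The macro name below
 * is ours: */
#include <stdint.h>
#define EXAMPLE_NEON_Q_VL(element_type) (16 / sizeof(element_type))
/* EXAMPLE_NEON_Q_VL(int32_t) == 4, matching __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 4) above. */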
defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private r_[3]; + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + simde_uint8x16x3_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + simde_uint8x16_from_private(r_[2]) + } }; + return r; #else simde_uint8x16_private r_[3]; @@ -579,6 +724,18 @@ simde_uint16x8x3_t simde_vld3q_u16(uint16_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_[3]; + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + simde_uint16x8x3_t r = { { + simde_uint16x8_from_private(r_[0]), + simde_uint16x8_from_private(r_[1]), + simde_uint16x8_from_private(r_[2]) + } }; + return r; #else simde_uint16x8_private r_[3]; @@ -607,6 +764,18 @@ simde_uint32x4x3_t simde_vld3q_u32(uint32_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_[3]; + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 2); + simde_uint32x4x3_t r = { { + simde_uint32x4_from_private(r_[0]), + simde_uint32x4_from_private(r_[1]), + simde_uint32x4_from_private(r_[2]) + } }; + return r; #else simde_uint32x4_private r_[3]; @@ -635,6 +804,18 @@ simde_uint64x2x3_t simde_vld3q_u64(uint64_t const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_[3]; + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + simde_uint64x2x3_t r = { { + simde_uint64x2_from_private(r_[0]), + simde_uint64x2_from_private(r_[1]), + simde_uint64x2_from_private(r_[2]) + } }; + return r; #else simde_uint64x2_private r_[3]; @@ -666,11 +847,18 @@ simde_vld3_p8(simde_poly8_t const *ptr) { #else simde_poly8x8_private r_[3]; - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly8x8x3_t r = { { simde_poly8x8_from_private(r_[0]), @@ -694,11 +882,18 @@ simde_vld3_p16(simde_poly16_t const *ptr) { #else simde_poly16x4_private r_[3]; - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / 
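/* Editor's note: a sketch, not part of the patch. RVV has no polynomial
 * element type, so the vld3_p8/p16/p64 hunks in this region reuse the
 * unsigned segment loads: a poly8x8 and a uint8x8 carry identical bits and
 * differ only in NEON-level semantics. Reduced illustration (assuming, as
 * SIMDe does, that simde_poly8_t is an 8-bit integer type): */
#include <stdint.h>
#include <riscv_vector.h>

/* poly8 bits travel through the u8 intrinsics unchanged; vl = 8 lanes. */
static vuint8m1_t example_load_poly8_bits(const uint8_t *poly_bits) {
  return __riscv_vle8_v_u8m1(poly_bits, 8);
}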
sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly16x4x3_t r = { { simde_poly16x4_from_private(r_[0]), @@ -722,11 +917,18 @@ simde_vld3_p64(simde_poly64_t const *ptr) { #else simde_poly64x1_private r_[3]; - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly64x1x3_t r = { { simde_poly64x1_from_private(r_[0]), @@ -750,11 +952,18 @@ simde_vld3q_p8(simde_poly8_t const *ptr) { #else simde_poly8x16_private r_[3]; - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly8x16x3_t r = { { simde_poly8x16_from_private(r_[0]), @@ -778,11 +987,18 @@ simde_vld3q_p16(simde_poly16_t const *ptr) { #else simde_poly16x8_private r_[3]; - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly16x8x3_t r = { { simde_poly16x8_from_private(r_[0]), @@ -806,11 +1022,18 @@ simde_vld3q_p64(simde_poly64_t const *ptr) { #else simde_poly64x2_private r_[3]; - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - 
r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } + #endif simde_poly64x2x3_t r = { { simde_poly64x2_from_private(r_[0]), diff --git a/simde/arm/neon/ld4.h b/simde/arm/neon/ld4.h index 7f6db039d..7446b9bd5 100644 --- a/simde/arm/neon/ld4.h +++ b/simde/arm/neon/ld4.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD4_H) @@ -47,9 +48,17 @@ simde_vld4_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4_f16(ptr); #else simde_float16x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float16x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float16x4x4_t s_ = { { simde_float16x4_from_private(a_[0]), simde_float16x4_from_private(a_[1]), simde_float16x4_from_private(a_[2]), simde_float16x4_from_private(a_[3]) } }; return (s_); @@ -67,9 +76,17 @@ simde_vld4_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_f32(ptr); #else simde_float32x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float32x2x4_t s_ = { { simde_float32x2_from_private(a_[0]), simde_float32x2_from_private(a_[1]), simde_float32x2_from_private(a_[2]), simde_float32x2_from_private(a_[3]) } }; return (s_); @@ -87,9 +104,17 @@ simde_vld4_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_f64(ptr); #else simde_float64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float64x1_t) / sizeof(*ptr)) * 4 ; i++) { 
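/* Editor's note: an illustrative sketch, not part of the patch. ld4 repeats
 * the ld3 pattern with four-way segment loads: vlseg4 reads vl groups of
 * four elements and splits them across four registers. Standalone instance
 * for the float32x2x4 case (vl = 2), helper name ours: */
#include <riscv_vector.h>

static void example_deinterleave4_f32(const float *ptr, float out[4][2]) {
  vfloat32m1x4_t seg = __riscv_vlseg4e32_v_f32m1x4(ptr, 2);
  __riscv_vse32_v_f32m1(out[0], __riscv_vget_v_f32m1x4_f32m1(seg, 0), 2);
  __riscv_vse32_v_f32m1(out[1], __riscv_vget_v_f32m1x4_f32m1(seg, 1), 2);
  __riscv_vse32_v_f32m1(out[2], __riscv_vget_v_f32m1x4_f32m1(seg, 2), 2);
  __riscv_vse32_v_f32m1(out[3], __riscv_vget_v_f32m1x4_f32m1(seg, 3), 2);
}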
+ a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float64x1x4_t s_ = { { simde_float64x1_from_private(a_[0]), simde_float64x1_from_private(a_[1]), simde_float64x1_from_private(a_[2]), simde_float64x1_from_private(a_[3]) } }; return s_; @@ -107,9 +132,17 @@ simde_vld4_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4_s8(ptr); #else simde_int8x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int8x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int8x8x4_t s_ = { { simde_int8x8_from_private(a_[0]), simde_int8x8_from_private(a_[1]), simde_int8x8_from_private(a_[2]), simde_int8x8_from_private(a_[3]) } }; return s_; @@ -127,9 +160,17 @@ simde_vld4_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4_s16(ptr); #else simde_int16x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int16x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int16x4x4_t s_ = { { simde_int16x4_from_private(a_[0]), simde_int16x4_from_private(a_[1]), simde_int16x4_from_private(a_[2]), simde_int16x4_from_private(a_[3]) } }; return s_; @@ -147,9 +188,17 @@ simde_vld4_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_s32(ptr); #else simde_int32x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int32x2x4_t s_ = { { simde_int32x2_from_private(a_[0]), simde_int32x2_from_private(a_[1]), simde_int32x2_from_private(a_[2]), simde_int32x2_from_private(a_[3]) } }; return s_; @@ -167,9 +216,17 @@ simde_vld4_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_s64(ptr); #else simde_int64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 
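/* Editor's note: a worked restatement, not part of the patch. The ld4
 * scalar fallback writes a_[i % 4].values[i / 4] = ptr[i]: element i of the
 * interleaved stream belongs to output vector (i mod 4) at lane (i / 4),
 * which is the same four-way deinterleave vlseg4 performs in one
 * instruction. Spelled out for int16x4x4: */
#include <stddef.h>
#include <stdint.h>

static void example_vld4_s16_fallback(const int16_t ptr[16], int16_t out[4][4]) {
  for (size_t i = 0; i < 16; i++) {
    out[i % 4][i / 4] = ptr[i]; /* same layout __riscv_vlseg4e16 produces */
  }
}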
4].values[i / 4] = ptr[i]; + } + #endif simde_int64x1x4_t s_ = { { simde_int64x1_from_private(a_[0]), simde_int64x1_from_private(a_[1]), simde_int64x1_from_private(a_[2]), simde_int64x1_from_private(a_[3]) } }; return s_; @@ -187,9 +244,17 @@ simde_vld4_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4_u8(ptr); #else simde_uint8x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint8x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint8x8x4_t s_ = { { simde_uint8x8_from_private(a_[0]), simde_uint8x8_from_private(a_[1]), simde_uint8x8_from_private(a_[2]), simde_uint8x8_from_private(a_[3]) } }; return s_; @@ -207,9 +272,17 @@ simde_vld4_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4_u16(ptr); #else simde_uint16x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint16x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint16x4x4_t s_ = { { simde_uint16x4_from_private(a_[0]), simde_uint16x4_from_private(a_[1]), simde_uint16x4_from_private(a_[2]), simde_uint16x4_from_private(a_[3]) } }; return s_; @@ -227,9 +300,17 @@ simde_vld4_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_u32(ptr); #else simde_uint32x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint32x2x4_t s_ = { { simde_uint32x2_from_private(a_[0]), simde_uint32x2_from_private(a_[1]), simde_uint32x2_from_private(a_[2]), simde_uint32x2_from_private(a_[3]) } }; return s_; @@ -247,9 +328,17 @@ simde_vld4_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_u64(ptr); #else simde_uint64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint64x1_t) / sizeof(*ptr)) * 4 ; 
i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint64x1x4_t s_ = { { simde_uint64x1_from_private(a_[0]), simde_uint64x1_from_private(a_[1]), simde_uint64x1_from_private(a_[2]), simde_uint64x1_from_private(a_[3]) } }; return s_; @@ -267,9 +356,17 @@ simde_vld4q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4q_f16(ptr); #else simde_float16x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float16x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]), simde_float16x8_from_private(a_[1]), simde_float16x8_from_private(a_[2]), simde_float16x8_from_private(a_[3]) } }; return s_; @@ -287,9 +384,17 @@ simde_vld4q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_f32(ptr); #else simde_float32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]), simde_float32x4_from_private(a_[1]), simde_float32x4_from_private(a_[2]), simde_float32x4_from_private(a_[3]) } }; return s_; @@ -307,9 +412,17 @@ simde_vld4q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_f64(ptr); #else simde_float64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]), simde_float64x2_from_private(a_[1]), simde_float64x2_from_private(a_[2]), simde_float64x2_from_private(a_[3]) } }; return s_; @@ -327,9 +440,17 @@ simde_vld4q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { return vld4q_s8(ptr); #else simde_int8x16_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int8x16_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 1); + a_[2].sv128 = 
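/* Editor's note: a schematic, not part of the patch. The float16 q-form
 * above is guarded three ways, because the vfloat16m1 intrinsics and the
 * _Float16 pointer cast are only usable when the Zvfh extension is present
 * and the configured vector size can hold a full 128-bit NEON register.
 * The guard shape, reduced to its skeleton (macro names as used throughout
 * this patch): */
#if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128)
  /* _Float16 RVV path is safe to compile here */
#else
  /* portable lane-by-lane fallback */
#endif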
__riscv_vget_v_i8m1x4_i8m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int8x16_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]), simde_int8x16_from_private(a_[1]), simde_int8x16_from_private(a_[2]), simde_int8x16_from_private(a_[3]) } }; return s_; @@ -347,9 +468,17 @@ simde_vld4q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4q_s16(ptr); #else simde_int16x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int16x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]), simde_int16x8_from_private(a_[1]), simde_int16x8_from_private(a_[2]), simde_int16x8_from_private(a_[3]) } }; return s_; @@ -367,9 +496,17 @@ simde_vld4q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_s32(ptr); #else simde_int32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int32x4x4_t s_ = { { simde_int32x4_from_private(a_[0]), simde_int32x4_from_private(a_[1]), simde_int32x4_from_private(a_[2]), simde_int32x4_from_private(a_[3]) } }; return s_; @@ -387,9 +524,17 @@ simde_vld4q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_s64(ptr); #else simde_int64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]), simde_int64x2_from_private(a_[1]), simde_int64x2_from_private(a_[2]), simde_int64x2_from_private(a_[3]) } }; return s_; @@ -443,6 +588,20 @@ simde_vld4q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { simde_uint8x16_from_private(r_[2]), simde_uint8x16_from_private(r_[3])}}; return s_; + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private r_[4]; + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + 
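/* Editor's note: a schematic, not part of the patch. For vld4q_u8 the RVV
 * branch is spliced into an existing #if/#elif dispatch chain rather than
 * into the generic fallback, so branch order decides which implementation
 * wins when several are compiled in. The chain's shape (intermediate
 * platform branches elided): */
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  /* real NEON instruction */
#elif defined(SIMDE_RISCV_V_NATIVE)
  /* vlseg4e8 path added by this patch */
#else
  /* portable scalar loop */
#endif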
r_[3].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + simde_uint8x16x4_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + simde_uint8x16_from_private(r_[2]), + simde_uint8x16_from_private(r_[3]) + } }; + return r; #else simde_uint8x16_private a_[4]; for (size_t i = 0; i < (sizeof(simde_uint8x16_t) / sizeof(*ptr)) * 4 ; i++) { @@ -465,9 +624,17 @@ simde_vld4q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4q_u16(ptr); #else simde_uint16x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint16x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]), simde_uint16x8_from_private(a_[1]), simde_uint16x8_from_private(a_[2]), simde_uint16x8_from_private(a_[3]) } }; return s_; @@ -485,9 +652,17 @@ simde_vld4q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_u32(ptr); #else simde_uint32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]), simde_uint32x4_from_private(a_[1]), simde_uint32x4_from_private(a_[2]), simde_uint32x4_from_private(a_[3]) } }; return s_; @@ -505,9 +680,17 @@ simde_vld4q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_u64(ptr); #else simde_uint64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]), simde_uint64x2_from_private(a_[1]), simde_uint64x2_from_private(a_[2]), simde_uint64x2_from_private(a_[3]) } }; return s_; @@ -525,9 +708,17 @@ simde_vld4_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4_p8(ptr); #else simde_poly8x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_poly8x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + 
a_[2].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_poly8x8x4_t s_ = { { simde_poly8x8_from_private(a_[0]), simde_poly8x8_from_private(a_[1]), simde_poly8x8_from_private(a_[2]), simde_poly8x8_from_private(a_[3]) } }; return s_; @@ -545,9 +736,17 @@ simde_vld4_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4_p16(ptr); #else simde_poly16x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_poly16x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_poly16x4x4_t s_ = { { simde_poly16x4_from_private(a_[0]), simde_poly16x4_from_private(a_[1]), simde_poly16x4_from_private(a_[2]), simde_poly16x4_from_private(a_[3]) } }; return s_; @@ -565,9 +764,17 @@ simde_vld4_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_p64(ptr); #else simde_poly64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_poly64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_poly64x1x4_t s_ = { { simde_poly64x1_from_private(a_[0]), simde_poly64x1_from_private(a_[1]), simde_poly64x1_from_private(a_[2]), simde_poly64x1_from_private(a_[3]) } }; return s_; @@ -585,9 +792,17 @@ simde_vld4q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { return vld4q_p8(ptr); #else simde_poly8x16_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_poly8x16_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly8x16_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]), simde_poly8x16_from_private(a_[1]), simde_poly8x16_from_private(a_[2]), simde_poly8x16_from_private(a_[3]) } }; return s_; @@ -605,9 +820,17 @@ simde_vld4q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4q_p16(ptr); #else simde_poly16x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_poly16x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + 
a_[1].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]), simde_poly16x8_from_private(a_[1]), simde_poly16x8_from_private(a_[2]), simde_poly16x8_from_private(a_[3]) } }; return s_; @@ -625,9 +848,17 @@ simde_vld4q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_p64(ptr); #else simde_poly64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_poly64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]), simde_poly64x2_from_private(a_[1]), simde_poly64x2_from_private(a_[2]), simde_poly64x2_from_private(a_[3]) } }; return s_; diff --git a/simde/arm/neon/mul.h b/simde/arm/neon/mul.h index 26b3a1b10..590b0eae5 100644 --- a/simde/arm/neon/mul.h +++ b/simde/arm/neon/mul.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MUL_H) @@ -91,7 +92,9 @@ simde_vmul_f32(simde_float32x2_t a, simde_float32x2_t b) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vv_f32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -119,7 +122,9 @@ simde_vmul_f64(simde_float64x1_t a, simde_float64x1_t b) { a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vv_f64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -147,7 +152,9 @@ simde_vmul_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -177,6 +184,8 @@ simde_vmul_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _m_pmullw(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else @@ -205,7 +214,9 @@ simde_vmul_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && 
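/* Editor's note: an illustrative sketch, not part of the patch. The mul.h
 * hunks all reduce to a single vector-vector multiply with an explicit vl:
 * __riscv_vfmul_vv_* for floats, __riscv_vmul_vv_* for integers, placed
 * ahead of the generic SIMDE_VECTOR_SUBSCRIPT_OPS branch so RVV wins when
 * available. Standalone instance for two int8x8 operands (vl = 8), helper
 * name ours: */
#include <stdint.h>
#include <riscv_vector.h>

static void example_vmul_i8(const int8_t *a, const int8_t *b, int8_t *r) {
  vint8m1_t va = __riscv_vle8_v_i8m1(a, 8);
  vint8m1_t vb = __riscv_vle8_v_i8m1(b, 8);
  __riscv_vse8_v_i8m1(r, __riscv_vmul_vv_i8m1(va, vb, 8), 8);
}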
!defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -230,7 +241,9 @@ simde_x_vmul_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -253,7 +266,9 @@ simde_vmul_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -281,7 +296,9 @@ simde_vmul_u16(simde_uint16x4_t a, simde_uint16x4_t b) { a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u16m1(a_.sv64, b_.sv64, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -309,7 +326,9 @@ simde_vmul_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -334,7 +353,9 @@ simde_x_vmul_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -387,6 +408,8 @@ simde_vmulq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128 = _mm_mul_ps(a_.m128, b_.m128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vv_f32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -419,6 +442,8 @@ simde_vmulq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128d = _mm_mul_pd(a_.m128d, b_.m128d); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vv_f64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -470,6 +495,8 @@ simde_vmulq_s8(simde_int8x16_t a, simde_int8x16_t b) { ) #endif ); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -500,6 +527,8 @@ simde_vmulq_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = 
_mm_mullo_epi16(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -530,6 +559,8 @@ simde_vmulq_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -559,6 +590,8 @@ simde_x_vmulq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.v128 = wasm_i64x2_mul(a_.v128, b_.v128); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) r_.m128i = _mm_mullo_epi64(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -576,6 +609,13 @@ simde_uint8x16_t simde_vmulq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b); + r_.sv128 = __riscv_vmul_vv_u8m1(a_.sv128, b_.sv128, 16); + return simde_uint8x16_from_private(r_); #else return simde_vreinterpretq_u8_s8( @@ -596,6 +636,13 @@ simde_uint16x8_t simde_vmulq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + r_.sv128 = __riscv_vmul_vv_u16m1(a_.sv128, b_.sv128, 8); + return simde_uint16x8_from_private(r_); #else return simde_vreinterpretq_u16_s16( @@ -616,6 +663,13 @@ simde_uint32x4_t simde_vmulq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + r_.sv128 = __riscv_vmul_vv_u32m1(a_.sv128, b_.sv128, 4); + return simde_uint32x4_from_private(r_); #else return simde_vreinterpretq_u32_s32( @@ -634,13 +688,22 @@ simde_vmulq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_x_vmulq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { - return - simde_vreinterpretq_u64_s64( - simde_x_vmulq_s64( - simde_vreinterpretq_s64_u64(a), - simde_vreinterpretq_s64_u64(b) - ) - ); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + r_.sv128 = __riscv_vmul_vv_u64m1(a_.sv128, b_.sv128, 2); + return simde_uint64x2_from_private(r_); + #else + return + simde_vreinterpretq_u64_s64( + simde_x_vmulq_s64( + simde_vreinterpretq_s64_u64(a), + simde_vreinterpretq_s64_u64(b) + ) + ); + #endif } SIMDE_FUNCTION_ATTRIBUTES diff --git a/simde/arm/neon/mul_lane.h b/simde/arm/neon/mul_lane.h index 1ac2e9420..72c032eea 100644 --- a/simde/arm/neon/mul_lane.h +++ b/simde/arm/neon/mul_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su */ #if !defined(SIMDE_ARM_NEON_MUL_LANE_H) @@ -182,10 +183,14 @@ simde_vmul_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane) a_ = simde_float32x2_to_private(a), b_ = 
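/* Editor's note: a sketch, not part of the patch. Previously the unsigned
 * q-form multiplies were routed through the signed ones via reinterprets
 * (bit-identical, since low-half multiplication wraps the same way for both
 * signednesses); the hunks above add a direct RVV branch instead: convert
 * to the private representation, multiply once, convert back. Reduced to a
 * standalone form for uint8x16 (vl = 16): */
#include <stdint.h>
#include <riscv_vector.h>

static void example_vmulq_u8_direct(const uint8_t *a, const uint8_t *b, uint8_t *r) {
  vuint8m1_t va = __riscv_vle8_v_u8m1(a, 16);
  vuint8m1_t vb = __riscv_vle8_v_u8m1(b, 16);
  __riscv_vse8_v_u8m1(r, __riscv_vmul_vv_u8m1(va, vb, 16), 16);
}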
simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x2_from_private(r_); } @@ -206,10 +211,14 @@ simde_vmul_lane_f64(simde_float64x1_t a, simde_float64x1_t b, const int lane) a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x1_from_private(r_); } @@ -230,10 +239,14 @@ simde_vmul_lane_s16(simde_int16x4_t a, simde_int16x4_t b, const int lane) a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x4_from_private(r_); } @@ -254,10 +267,14 @@ simde_vmul_lane_s32(simde_int32x2_t a, simde_int32x2_t b, const int lane) a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x2_from_private(r_); } @@ -278,10 +295,14 @@ simde_vmul_lane_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int lane) a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x4_from_private(r_); } @@ -302,10 +323,14 @@ simde_vmul_lane_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int lane) a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * 
b_.values[lane]; + } + #endif return simde_uint32x2_from_private(r_); } @@ -327,10 +352,14 @@ simde_vmul_laneq_s16(simde_int16x4_t a, simde_int16x8_t b, const int lane) simde_int16x8_private b_ = simde_int16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x4_from_private(r_); } @@ -352,10 +381,14 @@ simde_vmul_laneq_s32(simde_int32x2_t a, simde_int32x4_t b, const int lane) simde_int32x4_private b_ = simde_int32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x2_from_private(r_); } @@ -377,10 +410,14 @@ simde_vmul_laneq_u16(simde_uint16x4_t a, simde_uint16x8_t b, const int lane) simde_uint16x8_private b_ = simde_uint16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x4_from_private(r_); } @@ -402,10 +439,14 @@ simde_vmul_laneq_u32(simde_uint32x2_t a, simde_uint32x4_t b, const int lane) simde_uint32x4_private b_ = simde_uint32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x2_from_private(r_); } @@ -450,10 +491,14 @@ simde_vmulq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane) a_ = simde_float32x4_to_private(a); simde_float32x2_private b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x4_from_private(r_); } @@ -474,10 +519,14 @@ simde_vmulq_lane_f64(simde_float64x2_t a, simde_float64x1_t b, const int lane) a_ = simde_float64x2_to_private(a); simde_float64x1_private b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if 
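/* Editor's note: an illustrative sketch, not part of the patch. The
 * *_lane hunks replace the per-element loop with the vector-scalar forms,
 * __riscv_vmul_vx_* and __riscv_vfmul_vf_*, which broadcast the selected
 * lane b_.values[lane] across every lane of a in one instruction.
 * Standalone float32 instance (vl = 2), helper name ours: */
#include <riscv_vector.h>

static void example_mul_lane_f32(const float *a, float b_lane, float *r) {
  vfloat32m1_t va = __riscv_vle32_v_f32m1(a, 2);
  __riscv_vse32_v_f32m1(r, __riscv_vfmul_vf_f32m1(va, b_lane, 2), 2);
}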
defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x2_from_private(r_); } @@ -498,10 +547,14 @@ simde_vmulq_lane_s16(simde_int16x8_t a, simde_int16x4_t b, const int lane) a_ = simde_int16x8_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x8_from_private(r_); } @@ -522,10 +575,14 @@ simde_vmulq_lane_s32(simde_int32x4_t a, simde_int32x2_t b, const int lane) a_ = simde_int32x4_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x4_from_private(r_); } @@ -546,10 +603,14 @@ simde_vmulq_lane_u16(simde_uint16x8_t a, simde_uint16x4_t b, const int lane) a_ = simde_uint16x8_to_private(a); simde_uint16x4_private b_ = simde_uint16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x8_from_private(r_); } @@ -570,10 +631,14 @@ simde_vmulq_lane_u32(simde_uint32x4_t a, simde_uint32x2_t b, const int lane) a_ = simde_uint32x4_to_private(a); simde_uint32x2_private b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x4_from_private(r_); } @@ -618,10 +683,14 @@ simde_vmulq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane) a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x4_from_private(r_); } @@ -642,10 +711,14 @@ 
simde_vmulq_laneq_f64(simde_float64x2_t a, simde_float64x2_t b, const int lane) a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x2_from_private(r_); } @@ -666,10 +739,14 @@ simde_vmulq_laneq_s16(simde_int16x8_t a, simde_int16x8_t b, const int lane) a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x8_from_private(r_); } @@ -690,10 +767,14 @@ simde_vmulq_laneq_s32(simde_int32x4_t a, simde_int32x4_t b, const int lane) a_ = simde_int32x4_to_private(a), b_ = simde_int32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x4_from_private(r_); } @@ -714,10 +795,14 @@ simde_vmulq_laneq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int lane) a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x8_from_private(r_); } @@ -738,10 +823,14 @@ simde_vmulq_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int lane) a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x4_from_private(r_); } @@ -786,10 +875,14 @@ simde_vmul_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane) a_ = simde_float32x2_to_private(a); simde_float32x4_private b_ = simde_float32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, 
b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x2_from_private(r_); } @@ -810,10 +903,14 @@ simde_vmul_laneq_f64(simde_float64x1_t a, simde_float64x2_t b, const int lane) a_ = simde_float64x1_to_private(a); simde_float64x2_private b_ = simde_float64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x1_from_private(r_); } diff --git a/simde/arm/neon/mulx_lane.h b/simde/arm/neon/mulx_lane.h index 06c02f3cb..eed553651 100644 --- a/simde/arm/neon/mulx_lane.h +++ b/simde/arm/neon/mulx_lane.h @@ -156,10 +156,14 @@ simde_vmulx_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane) a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x2_from_private(r_); } @@ -180,10 +184,14 @@ simde_vmulx_lane_f64(simde_float64x1_t a, simde_float64x1_t b, const int lane) a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x1_from_private(r_); } @@ -230,10 +238,14 @@ simde_vmulxq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane) a_ = simde_float32x4_to_private(a); simde_float32x2_private b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x4_from_private(r_); } @@ -254,10 +266,14 @@ simde_vmulxq_lane_f64(simde_float64x2_t a, simde_float64x1_t b, const int lane) a_ = simde_float64x2_to_private(a); simde_float64x1_private b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = 
a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x2_from_private(r_); } @@ -304,10 +320,14 @@ simde_vmulxq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane) a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x4_from_private(r_); } @@ -328,10 +348,14 @@ simde_vmulxq_laneq_f64(simde_float64x2_t a, simde_float64x2_t b, const int lane) a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x2_from_private(r_); } @@ -378,10 +402,14 @@ simde_vmulx_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane) a_ = simde_float32x2_to_private(a); simde_float32x4_private b_ = simde_float32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x2_from_private(r_); } @@ -402,10 +430,14 @@ simde_vmulx_laneq_f64(simde_float64x1_t a, simde_float64x2_t b, const int lane) a_ = simde_float64x1_to_private(a); simde_float64x2_private b_ = simde_float64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x1_from_private(r_); } diff --git a/simde/arm/neon/st1.h b/simde/arm/neon/st1.h index b91658149..2e9b912a7 100644 --- a/simde/arm/neon/st1.h +++ b/simde/arm/neon/st1.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST1_H) @@ -41,7 +42,11 @@ simde_vst1_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4_t val vst1_f16(ptr, val); #else simde_float16x4_private val_ = simde_float16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + __riscv_vse16_v_f16m1((_Float16 *)ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -56,7 +61,11 @@ simde_vst1_f32(simde_float32_t 
ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x2_t val vst1_f32(ptr, val); #else simde_float32x2_private val_ = simde_float32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_f32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -71,7 +80,11 @@ simde_vst1_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_float64x1_t val vst1_f64(ptr, val); #else simde_float64x1_private val_ = simde_float64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_f64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -86,7 +99,11 @@ simde_vst1_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int8x8_t val) { vst1_s8(ptr, val); #else simde_int8x8_private val_ = simde_int8x8_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_i8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -101,7 +118,11 @@ simde_vst1_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int16x4_t val) { vst1_s16(ptr, val); #else simde_int16x4_private val_ = simde_int16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_i16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -116,7 +137,11 @@ simde_vst1_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x2_t val) { vst1_s32(ptr, val); #else simde_int32x2_private val_ = simde_int32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_i32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -131,7 +156,11 @@ simde_vst1_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_int64x1_t val) { vst1_s64(ptr, val); #else simde_int64x1_private val_ = simde_int64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_i64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -146,7 +175,11 @@ simde_vst1_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint8x8_t val) { vst1_u8(ptr, val); #else simde_uint8x8_private val_ = simde_uint8x8_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -161,7 +194,11 @@ simde_vst1_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint16x4_t val) { vst1_u16(ptr, val); #else simde_uint16x4_private val_ = simde_uint16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -176,7 +213,11 @@ simde_vst1_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x2_t val) { vst1_u32(ptr, val); #else simde_uint32x2_private val_ = 
simde_uint32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_u32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -191,7 +232,11 @@ simde_vst1_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_uint64x1_t val) { vst1_u64(ptr, val); #else simde_uint64x1_private val_ = simde_uint64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -209,6 +254,8 @@ simde_vst1q_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_float16x8_t va #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + __riscv_vse16_v_f16m1((_Float16 *)ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -229,6 +276,8 @@ simde_vst1q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x4_t va #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_f32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -249,6 +298,8 @@ simde_vst1q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x2_t va #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_f64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -269,6 +320,8 @@ simde_vst1q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int8x16_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_i8m1(ptr , val_.sv128 , 16); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -289,6 +342,8 @@ simde_vst1q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int16x8_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_i16m1(ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -309,6 +364,8 @@ simde_vst1q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x4_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_i32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -329,6 +386,8 @@ simde_vst1q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x2_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_i64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -349,6 +408,8 @@ simde_vst1q_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint8x16_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv128 , 16); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -369,6 +430,8 @@ simde_vst1q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint16x8_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -389,6 +452,8 @@ 
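/*
 * Pattern for the st1 family in the hunks above and below: each vst1/vst1q
 * variant becomes a single unit-stride RVV store, __riscv_vse{8,16,32,64}_v_*m1,
 * replacing the simde_memcpy fallback. The d-register forms store from the
 * 64-bit sv64 member, the q-register forms from sv128, with vl equal to the
 * NEON lane count; float16 additionally needs the (_Float16 *) cast and is
 * guarded on Zvfh. A minimal sketch of the same idiom, with a hypothetical
 * name, assuming <riscv_vector.h>:
 */
#include <riscv_vector.h>
#include <stdint.h>

/* Analogue of vst1q_u32: store all 4 lanes of a q-register, unit stride. */
static void example_st1q_u32(uint32_t ptr[4], vuint32m1_t val) {
  __riscv_vse32_v_u32m1(ptr, val, 4);  /* vl = 4 lanes */
}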
simde_vst1q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint32x4_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_u32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -409,6 +474,8 @@ simde_vst1q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -426,7 +493,11 @@ simde_vst1_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly8x8_t val) { vst1_p8(ptr, val); #else simde_poly8x8_private val_ = simde_poly8x8_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -441,7 +512,11 @@ simde_vst1_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x4_t val) vst1_p16(ptr, val); #else simde_poly16x4_private val_ = simde_poly16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -456,7 +531,11 @@ simde_vst1_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_poly64x1_t val) vst1_p64(ptr, val); #else simde_poly64x1_private val_ = simde_poly64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) @@ -471,7 +550,11 @@ simde_vst1q_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly8x16_t val) vst1q_p8(ptr, val); #else simde_poly8x16_private val_ = simde_poly8x16_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -486,7 +569,11 @@ simde_vst1q_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly16x8_t val) vst1q_p16(ptr, val); #else simde_poly16x8_private val_ = simde_poly16x8_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -501,7 +588,11 @@ simde_vst1q_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2_t val) vst1q_p64(ptr, val); #else simde_poly64x2_private val_ = simde_poly64x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/st1_x2.h b/simde/arm/neon/st1_x2.h index 59d3b395d..2b9f94c96 100644 --- a/simde/arm/neon/st1_x2.h +++ b/simde/arm/neon/st1_x2.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) 
*/ #if !defined(SIMDE_ARM_NEON_ST1_X2_H) @@ -43,11 +44,18 @@ simde_vst1_f16_x2(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_float16x4x2_ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && !defined(SIMDE_BUG_GCC_REV_260989) vst1_f16_x2(ptr, val); #else - simde_float16x4_private val_[2]; - for (size_t i = 0; i < 2; i++) { - val_[i] = simde_float16x4_to_private(val.val[i]); - } - simde_memcpy(ptr, &val_, sizeof(val_)); + simde_float16x4_private a_[2] = {simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a_[1].sv64 , 4); + #else + simde_float16_t buf[8]; + for (size_t i = 0; i < 8; i++) { + buf[i] = a_[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -216,7 +224,12 @@ simde_vst1_p8_x2(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly8x8x2_t va for (size_t i = 0; i < 2; i++) { val_[i] = simde_poly8x8_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -235,7 +248,12 @@ simde_vst1_p16_x2(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly16x4x2_t for (size_t i = 0; i < 2; i++) { val_[i] = simde_poly16x4_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -254,7 +272,12 @@ simde_vst1_p64_x2(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t for (size_t i = 0; i < 2; i++) { val_[i] = simde_poly64x1_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/st1_x3.h b/simde/arm/neon/st1_x3.h index 35ead1244..510c9d67e 100644 --- a/simde/arm/neon/st1_x3.h +++ b/simde/arm/neon/st1_x3.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST1_X3_H) @@ -43,11 +44,20 @@ simde_vst1_f16_x3(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float16x4x3 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst1_f16_x3(ptr, val); #else - simde_float16x4_private val_[3]; - for (size_t i = 0; i < 3; i++) { - val_[i] = simde_float16x4_to_private(val.val[i]); - } - simde_memcpy(ptr, &val_, sizeof(val_)); + simde_float16x4_private a[3] = { simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a[1].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 
*)ptr+8 , a[2].sv64 , 4); + #else + simde_float16_t buf[12]; + for (size_t i = 0; i < 12 ; i++) { + buf[i] = a[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -226,7 +236,13 @@ simde_vst1_p8_x3(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly8x8x3_t va for (size_t i = 0; i < 3; i++) { val_[i] = simde_poly8x8_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+16 , val_[2].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -245,7 +261,13 @@ simde_vst1_p16_x3(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_poly16x4x3_t for (size_t i = 0; i < 3; i++) { val_[i] = simde_poly16x4_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+8 , val_[2].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -264,7 +286,13 @@ simde_vst1_p64_x3(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t for (size_t i = 0; i < 3; i++) { val_[i] = simde_poly64x1_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+2 , val_[2].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/st1_x4.h b/simde/arm/neon/st1_x4.h index 873e1f413..41f6db6e1 100644 --- a/simde/arm/neon/st1_x4.h +++ b/simde/arm/neon/st1_x4.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST1_X4_H) @@ -43,11 +44,20 @@ simde_vst1_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x4x4 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst1_f16_x4(ptr, val); #else - simde_float16x4_private val_[4]; - for (size_t i = 0; i < 4; i++) { - val_[i] = simde_float16x4_to_private(val.val[i]); - } - simde_memcpy(ptr, &val_, sizeof(val_)); + simde_float16x4_private a_[4] = { simde_float16x4_to_private(val.val[0]), simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]), simde_float16x4_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a_[1].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[2].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+12 , a_[3].sv64 , 4); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < 16 ; i++) { + buf[i] = a_[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -236,7 +246,14 @@ simde_vst1_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x8x4_t va for (size_t i = 0; i < 4; i++) { val_[i] = simde_poly8x8_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, 
sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+16 , val_[2].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+24 , val_[3].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -255,7 +272,14 @@ simde_vst1_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x4x4_t for (size_t i = 0; i < 4; i++) { val_[i] = simde_poly16x4_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+8 , val_[2].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+12 , val_[3].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -274,7 +298,14 @@ simde_vst1_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t for (size_t i = 0; i < 4; i++) { val_[i] = simde_poly64x1_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+2 , val_[2].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+3 , val_[3].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/st1q_x2.h b/simde/arm/neon/st1q_x2.h index eeee23975..4e96191af 100644 --- a/simde/arm/neon/st1q_x2.h +++ b/simde/arm/neon/st1q_x2.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST1Q_X2_H) @@ -41,11 +42,18 @@ simde_vst1q_f16_x2(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x8x #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst1q_f16_x2(ptr, val); #else - simde_float16x8_private val_[2]; - for (size_t i = 0; i < 2; i++) { - val_[i] = simde_float16x8_to_private(val.val[i]); - } - simde_memcpy(ptr, &val_, sizeof(val_)); + simde_float16x8_private a_[2] = {simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[1].sv128 , 8); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < 16; i++) { + buf[i] = a_[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -214,7 +222,12 @@ simde_vst1q_p8_x2(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x16x2_t for (size_t i = 0; i < 2; i++) { val_[i] = simde_poly8x16_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -233,7 +246,12 @@ simde_vst1q_p16_x2(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x8x2_ for (size_t i = 0; i < 2; i++) { val_[i] = simde_poly16x8_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + 
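/*
 * Pattern for the st1_xN / st1q_xN family: unlike st2/st3/st4, the _xN
 * stores are not interleaved -- the N vectors land back to back in memory.
 * The RVV paths in these hunks therefore just issue N unit-stride vse
 * stores at ptr, ptr + lanes, ptr + 2*lanes, and so on. A minimal sketch
 * (hypothetical name, assuming <riscv_vector.h>):
 */
#include <riscv_vector.h>
#include <stdint.h>

/* Analogue of vst1q_u16_x2: val0 -> ptr[0..7], val1 -> ptr[8..15]. */
static void example_st1q_x2_u16(uint16_t ptr[16],
                                vuint16m1_t val0, vuint16m1_t val1) {
  __riscv_vse16_v_u16m1(ptr,     val0, 8);
  __riscv_vse16_v_u16m1(ptr + 8, val1, 8);
}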
__riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -252,7 +270,12 @@ simde_vst1q_p64_x2(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x2_t for (size_t i = 0; i < 2; i++) { val_[i] = simde_poly64x2_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/st1q_x3.h b/simde/arm/neon/st1q_x3.h index 604e037c3..04beeb2c8 100644 --- a/simde/arm/neon/st1q_x3.h +++ b/simde/arm/neon/st1q_x3.h @@ -22,6 +22,7 @@ * * Copyright: * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST1Q_X3_H) @@ -41,11 +42,20 @@ simde_vst1q_f16_x3(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_float16x8x #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst1q_f16_x3(ptr, val); #else - simde_float16x8_private val_[3]; - for (size_t i = 0; i < 3; i++) { - val_[i] = simde_float16x8_to_private(val.val[i]); - } - simde_memcpy(ptr, &val_, sizeof(val_)); + simde_float16x8_private a[3] = { simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a[1].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+16 , a[2].sv128 , 8); + #else + simde_float16_t buf[24]; + for (size_t i = 0; i < 24 ; i++) { + buf[i] = a[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -224,7 +234,13 @@ simde_vst1q_p8_x3(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_poly8x16x3_t for (size_t i = 0; i < 3; i++) { val_[i] = simde_poly8x16_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+32 , val_[2].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -243,7 +259,13 @@ simde_vst1q_p16_x3(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly16x8x3_ for (size_t i = 0; i < 3; i++) { val_[i] = simde_poly16x8_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+16 , val_[2].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -262,7 +284,13 @@ simde_vst1q_p64_x3(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_poly64x2x3_t for (size_t i = 0; i < 3; i++) { val_[i] = simde_poly64x2_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+4 , 
val_[2].sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/st1q_x4.h b/simde/arm/neon/st1q_x4.h index a23651c60..9b91c632e 100644 --- a/simde/arm/neon/st1q_x4.h +++ b/simde/arm/neon/st1q_x4.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST1Q_X4_H) @@ -43,11 +44,20 @@ simde_vst1q_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_float16x8x #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst1q_f16_x4(ptr, val); #else - simde_float16x8_private val_[4]; - for (size_t i = 0; i < 4; i++) { - val_[i] = simde_float16x8_to_private(val.val[i]); - } - simde_memcpy(ptr, &val_, sizeof(val_)); + simde_float16x8_private a_[4] = { simde_float16x8_to_private(val.val[0]), simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]), simde_float16x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[1].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+16 , a_[2].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+24 , a_[3].sv128 , 8); + #else + simde_float16_t buf[32]; + for (size_t i = 0; i < 32 ; i++) { + buf[i] = a_[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -236,7 +246,14 @@ simde_vst1q_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_poly8x16x4_t for (size_t i = 0; i < 4; i++) { val_[i] = simde_poly8x16_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+32 , val_[2].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+48 , val_[3].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -255,7 +272,14 @@ simde_vst1q_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly16x8x4_ for (size_t i = 0; i < 4; i++) { val_[i] = simde_poly16x8_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+16 , val_[2].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+24 , val_[3].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -274,7 +298,14 @@ simde_vst1q_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly64x2x4_t for (size_t i = 0; i < 4; i++) { val_[i] = simde_poly64x2_to_private(val.val[i]); } - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+4 , val_[2].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+6 , val_[3].sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/st2.h b/simde/arm/neon/st2.h index b449f8525..157123cf7 100644 --- a/simde/arm/neon/st2.h +++ 
b/simde/arm/neon/st2.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST2_H) @@ -44,13 +45,20 @@ simde_vst2_f16(simde_float16_t *ptr, simde_float16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst2_f16(ptr, val); #else - simde_float16_t buf[8]; simde_float16x4_private a_[2] = {simde_float16x4_to_private(val.val[0]), simde_float16x4_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)ptr, 4); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_f16m1x2 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[8]; + for (size_t i = 0; i < 8 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -64,13 +72,20 @@ simde_vst2_f32(simde_float32_t *ptr, simde_float32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_f32(ptr, val); #else - simde_float32_t buf[4]; simde_float32x2_private a_[2] = {simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e32_v_f32m1x2 (ptr, dest, 2); + #else + simde_float32_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -84,13 +99,20 @@ simde_vst2_f64(simde_float64_t *ptr, simde_float64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2_f64(ptr, val); #else - simde_float64_t buf[2]; simde_float64x1_private a_[2] = {simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_f64m1x2 (ptr, dest, 1); + #else + simde_float64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -104,13 +126,20 @@ simde_vst2_s8(int8_t *ptr, simde_int8x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s8(ptr, val); #else - int8_t buf[16]; simde_int8x8_private a_[2] = {simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1])}; - for (size_t i = 0; i < 
(sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_i8m1x2 (ptr, dest, 8); + #else + int8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -124,13 +153,20 @@ simde_vst2_s16(int16_t *ptr, simde_int16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s16(ptr, val); #else - int16_t buf[8]; simde_int16x4_private a_[2] = {simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_i16m1x2 (ptr, dest, 4); + #else + int16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -144,13 +180,20 @@ simde_vst2_s32(int32_t *ptr, simde_int32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s32(ptr, val); #else - int32_t buf[4]; simde_int32x2_private a_[2] = {simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e32_v_i32m1x2 (ptr, dest, 2); + #else + int32_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -164,13 +207,20 @@ simde_vst2_s64(int64_t *ptr, simde_int64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s64(ptr, val); #else - int64_t buf[2]; simde_int64x1_private a_[2] = {simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_i64m1x2 (ptr, dest, 1); + #else + int64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -184,13 +234,20 @@ simde_vst2_u8(uint8_t *ptr, simde_uint8x8x2_t val) { #if 
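/*
 * Pattern for st2: vst2 interleaves two vectors ({a0,b0,a1,b1,...}), which
 * maps directly onto RVV segment stores. The hunks build a *m1x2 tuple with
 * two __riscv_vset_v_* calls and store it with __riscv_vsseg2e*. The
 * preceding __riscv_vlseg2e* load appears to exist only to obtain an
 * initialized tuple value (a tuple-construction intrinsic is not assumed to
 * be available); both fields are overwritten by vset before the store, so
 * nothing loaded is actually kept. A minimal sketch (hypothetical name,
 * assuming <riscv_vector.h>):
 */
#include <riscv_vector.h>
#include <stdint.h>

/* Analogue of vst2_s16: interleave two 4-lane vectors into ptr[0..7]. */
static void example_st2_s16(int16_t ptr[8], vint16m1_t a, vint16m1_t b) {
  vint16m1x2_t t = __riscv_vlseg2e16_v_i16m1x2(ptr, 4);  /* init only     */
  t = __riscv_vset_v_i16m1_i16m1x2(t, 0, a);
  t = __riscv_vset_v_i16m1_i16m1x2(t, 1, b);
  __riscv_vsseg2e16_v_i16m1x2(ptr, t, 4);  /* a0,b0,a1,b1,a2,b2,a3,b3 */
}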
defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u8(ptr, val); #else - uint8_t buf[16]; simde_uint8x8_private a_[2] = {simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 8); + #else + uint8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -204,13 +261,20 @@ simde_vst2_u16(uint16_t *ptr, simde_uint16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u16(ptr, val); #else - uint16_t buf[8]; simde_uint16x4_private a_[2] = {simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 4); + #else + uint16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -224,13 +288,20 @@ simde_vst2_u32(uint32_t *ptr, simde_uint32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u32(ptr, val); #else - uint32_t buf[4]; simde_uint32x2_private a_[2] = {simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e32_v_u32m1x2 (ptr, dest, 2); + #else + uint32_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -244,13 +315,20 @@ simde_vst2_u64(uint64_t *ptr, simde_uint64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u64(ptr, val); #else - uint64_t buf[2]; simde_uint64x1_private a_[2] = {simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 1); + #else + uint64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 
2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -263,6 +341,13 @@ void simde_vst2q_f16(simde_float16_t *ptr, simde_float16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst2q_f16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x8_private a_[2] = {simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1])}; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)ptr, 8); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_f16m1x2 ((_Float16 *)ptr, dest, 8); #else simde_float16x8x2_t r = simde_vzipq_f16(val.val[0], val.val[1]); simde_vst1q_f16(ptr, r.val[0]); @@ -279,6 +364,13 @@ void simde_vst2q_f32(simde_float32_t *ptr, simde_float32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_f32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private a_[2] = {simde_float32x4_to_private(val.val[0]), + simde_float32x4_to_private(val.val[1])}; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_f32m1x2 (ptr, dest, 4); #else simde_float32x4x2_t r = simde_vzipq_f32(val.val[0], val.val[1]); simde_vst1q_f32(ptr, r.val[0]); @@ -296,13 +388,20 @@ simde_vst2q_f64(simde_float64_t *ptr, simde_float64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_f64(ptr, val); #else - simde_float64_t buf[4]; simde_float64x2_private a_[2] = {simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(ptr, 2); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_f64m1x2 (ptr, dest, 2); + #else + simde_float64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -315,6 +414,13 @@ void simde_vst2q_s8(int8_t *ptr, simde_int8x16x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s8(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private a_[2] = {simde_int8x16_to_private(val.val[0]), + simde_int8x16_to_private(val.val[1])}; + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_i8m1x2 (ptr, dest, 16); #else simde_int8x16x2_t r = simde_vzipq_s8(val.val[0], val.val[1]); simde_vst1q_s8(ptr, r.val[0]); @@ -331,6 +437,13 @@ void simde_vst2q_s16(int16_t *ptr, simde_int16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private a_[2] = {simde_int16x8_to_private(val.val[0]), + simde_int16x8_to_private(val.val[1])}; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 0, a_[0].sv128); + dest = 
__riscv_vset_v_i16m1_i16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_i16m1x2 (ptr, dest, 8); #else simde_int16x8x2_t r = simde_vzipq_s16(val.val[0], val.val[1]); simde_vst1q_s16(ptr, r.val[0]); @@ -347,6 +460,13 @@ void simde_vst2q_s32(int32_t *ptr, simde_int32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private a_[2] = {simde_int32x4_to_private(val.val[0]), + simde_int32x4_to_private(val.val[1])}; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_i32m1x2 (ptr, dest, 4); #else simde_int32x4x2_t r = simde_vzipq_s32(val.val[0], val.val[1]); simde_vst1q_s32(ptr, r.val[0]); @@ -363,6 +483,13 @@ void simde_vst2q_s64(int64_t *ptr, simde_int64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_s64(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private a_[2] = {simde_int64x2_to_private(val.val[0]), + simde_int64x2_to_private(val.val[1])}; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_i64m1x2 (ptr, dest, 2); #else int64_t buf[4]; simde_int64x2_private a_[2] = {simde_int64x2_to_private(val.val[0]), @@ -383,6 +510,13 @@ void simde_vst2q_u8(uint8_t *ptr, simde_uint8x16x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u8(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private a_[2] = {simde_uint8x16_to_private(val.val[0]), + simde_uint8x16_to_private(val.val[1])}; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 16); #else simde_uint8x16x2_t r = simde_vzipq_u8(val.val[0], val.val[1]); simde_vst1q_u8(ptr, r.val[0]); @@ -399,6 +533,13 @@ void simde_vst2q_u16(uint16_t *ptr, simde_uint16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private a_[2] = {simde_uint16x8_to_private(val.val[0]), + simde_uint16x8_to_private(val.val[1])}; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 8); #else simde_uint16x8x2_t r = simde_vzipq_u16(val.val[0], val.val[1]); simde_vst1q_u16(ptr, r.val[0]); @@ -415,6 +556,13 @@ void simde_vst2q_u32(uint32_t *ptr, simde_uint32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private a_[2] = {simde_uint32x4_to_private(val.val[0]), + simde_uint32x4_to_private(val.val[1])}; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_u32m1x2 (ptr, dest, 4); #else simde_uint32x4x2_t r = simde_vzipq_u32(val.val[0], val.val[1]); simde_vst1q_u32(ptr, r.val[0]); @@ -432,13 +580,20 @@ simde_vst2q_u64(uint64_t *ptr, simde_uint64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_u64(ptr, val); #else - uint64_t buf[4]; simde_uint64x2_private a_[2] = {simde_uint64x2_to_private(val.val[0]), 
simde_uint64x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 2); + #else + uint64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -452,13 +607,20 @@ simde_vst2_p8(simde_poly8_t *ptr, simde_poly8x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_p8(ptr, val); #else - simde_poly8_t buf[16]; simde_poly8x8_private a_[2] = {simde_poly8x8_to_private(val.val[0]), simde_poly8x8_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 8); + #else + simde_poly8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -472,13 +634,20 @@ simde_vst2_p16(simde_poly16_t *ptr, simde_poly16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_p16(ptr, val); #else - simde_poly16_t buf[8]; simde_poly16x4_private a_[2] = {simde_poly16x4_to_private(val.val[0]), simde_poly16x4_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 4); + #else + simde_poly16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -492,13 +661,20 @@ simde_vst2_p64(simde_poly64_t *ptr, simde_poly64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) vst2_p64(ptr, val); #else - simde_poly64_t buf[2]; simde_poly64x1_private a_[2] = {simde_poly64x1_to_private(val.val[0]), simde_poly64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 1); + #else + simde_poly64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if 
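/*
 * Pattern for the polynomial types in the hunks around here: RVV has no
 * polynomial registers, so SIMDe's poly8/16/64 privates carry plain
 * unsigned vectors (vuint8m1_t etc.) and the vst2 poly paths reuse the
 * u8/u16/u64 segment-store intrinsics unchanged -- a store only moves bits,
 * so the representation is identical. A minimal sketch; the names are
 * hypothetical and it assumes poly8 is byte-sized and unsigned, as in
 * SIMDe's portable definition:
 */
#include <riscv_vector.h>
#include <stdint.h>

typedef uint8_t example_poly8_t;  /* stand-in for simde_poly8_t */

/* Analogue of vst2_p8: two 8-lane poly8 vectors through the u8m1 path. */
static void example_st2_p8(example_poly8_t ptr[16],
                           vuint8m1_t a, vuint8m1_t b) {
  vuint8m1x2_t t = __riscv_vlseg2e8_v_u8m1x2(ptr, 8);
  t = __riscv_vset_v_u8m1_u8m1x2(t, 0, a);
  t = __riscv_vset_v_u8m1_u8m1x2(t, 1, b);
  __riscv_vsseg2e8_v_u8m1x2(ptr, t, 8);
}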
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) @@ -512,13 +688,20 @@ simde_vst2q_p8(simde_poly8_t *ptr, simde_poly8x16x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_p8(ptr, val); #else - simde_poly8_t buf[32]; simde_poly8x16_private a_[2] = {simde_poly8x16_to_private(val.val[0]), simde_poly8x16_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 16); + #else + simde_poly8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -532,13 +715,20 @@ simde_vst2q_p16(simde_poly16_t *ptr, simde_poly16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_p16(ptr, val); #else - simde_poly16_t buf[16]; simde_poly16x8_private a_[2] = {simde_poly16x8_to_private(val.val[0]), simde_poly16x8_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 8); + #else + simde_poly16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -552,13 +742,20 @@ simde_vst2q_p64(simde_poly64_t *ptr, simde_poly64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_p64(ptr, val); #else - simde_poly64_t buf[4]; simde_poly64x2_private a_[2] = {simde_poly64x2_to_private(val.val[0]), simde_poly64x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 2); + #else + simde_poly64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -572,9 +769,9 @@ simde_vst2_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) vst2_bf16(ptr, val); #else - simde_bfloat16_t buf[8]; simde_bfloat16x4_private a_[2] = {simde_bfloat16x4_to_private(val.val[0]), simde_bfloat16x4_to_private(val.val[1])}; + simde_bfloat16_t buf[8]; for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { buf[i] = a_[i % 2].values[i / 2]; } @@ -592,9 +789,9 @@ simde_vst2q_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARM_NEON_BF16) vst2q_bf16(ptr, val); #else - simde_bfloat16_t buf[16]; simde_bfloat16x8_private a_[2] = {simde_bfloat16x8_to_private(val.val[0]), simde_bfloat16x8_to_private(val.val[1])}; + simde_bfloat16_t buf[16]; for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { buf[i] = a_[i % 2].values[i / 2]; } diff --git a/simde/arm/neon/st3.h b/simde/arm/neon/st3.h index d14b1f62e..940016b33 100644 --- a/simde/arm/neon/st3.h +++ b/simde/arm/neon/st3.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST3_H) @@ -47,11 +48,19 @@ simde_vst3_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float16x4x3_t simde_float16x4_private a[3] = { simde_float16x4_to_private(val.val[0]), simde_float16x4_to_private(val.val[1]), simde_float16x4_to_private(val.val[2]) }; - simde_float16_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { - buf[i] = a[i % 3].values[i / 3]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)ptr, 4); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e16_v_f16m1x3 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[12]; + for (size_t i = 0; i < 12 ; i++) { + buf[i] = a[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -68,7 +77,13 @@ simde_vst3_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t v simde_float32x2_private a[3] = { simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1]), simde_float32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_f32m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -77,7 +92,7 @@ simde_vst3_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t v simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else simde_float32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -98,9 +113,17 @@ simde_vst3_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x1x3_t v simde_float64x1_private a_[3] = { simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1]), simde_float64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x3_t dest = 
__riscv_vlseg3e64_v_f64m1x3(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_f64m1x3(ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -117,7 +140,13 @@ simde_vst3_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { simde_int8x8_private a_[3] = { simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1]), simde_int8x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e8_v_i8m1x3(ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(8, 8, r0, a_[2].values, @@ -137,7 +166,7 @@ simde_vst3_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else int8_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -158,7 +187,13 @@ simde_vst3_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { simde_int16x4_private a_[3] = { simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1]), simde_int16x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_i16m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 8, r0, a_[2].values, @@ -178,7 +213,7 @@ simde_vst3_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else int16_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -199,7 +234,13 @@ simde_vst3_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { simde_int32x2_private a[3] = { simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1]), simde_int32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_i32m1x3 (ptr, dest, 
2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -208,7 +249,7 @@ simde_vst3_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else int32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -229,9 +270,17 @@ simde_vst3_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t val) { simde_int64x1_private a_[3] = { simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1]), simde_int64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_i64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -248,7 +297,13 @@ simde_vst3_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { simde_uint8x8_private a_[3] = { simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1]), simde_uint8x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(8, 8, r0, a_[2].values, @@ -268,7 +323,7 @@ simde_vst3_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else uint8_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -289,7 +344,13 @@ simde_vst3_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) { simde_uint16x4_private a_[3] = { simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1]), simde_uint16x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 4); 
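
N.B. (annotation, not part of the diff): every RVV hunk in st3.h instantiates the same pattern. The three source vectors are packed into an RVV vector tuple with __riscv_vset_*, and a single segmented store __riscv_vsseg3e<w> writes them element-interleaved, which is exactly the vst3 memory layout. The leading __riscv_vlseg3* load only materializes a tuple value of the right type; all of its fields are overwritten before the store. A minimal standalone sketch of the pattern, assuming <riscv_vector.h> and a build where vuint16m1_t holds at least 4 lanes (the helper name is illustrative):

    #include <riscv_vector.h>
    #include <stdint.h>

    /* Store a, b, c interleaved:
       ptr[3*i+0] = a[i], ptr[3*i+1] = b[i], ptr[3*i+2] = c[i]. */
    static inline void interleave3_u16(uint16_t *ptr, vuint16m1_t a,
                                       vuint16m1_t b, vuint16m1_t c) {
      /* Seed a tuple of the right type; its loaded contents are irrelevant
         because every field is replaced below (same trick as the patch). */
      vuint16m1x3_t t = __riscv_vlseg3e16_v_u16m1x3(ptr, 4);
      t = __riscv_vset_v_u16m1_u16m1x3(t, 0, a);
      t = __riscv_vset_v_u16m1_u16m1x3(t, 1, b);
      t = __riscv_vset_v_u16m1_u16m1x3(t, 2, c);
      __riscv_vsseg3e16_v_u16m1x3(ptr, t, 4);   /* vl = 4 segments of 3 */
    }
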
+ #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 8, r0, a_[2].values, @@ -309,7 +370,7 @@ simde_vst3_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else uint16_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -330,7 +391,13 @@ simde_vst3_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { simde_uint32x2_private a[3] = { simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1]), simde_uint32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_u32m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -339,7 +406,7 @@ simde_vst3_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else uint32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -360,9 +427,17 @@ simde_vst3_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t val) { simde_uint64x1_private a_[3] = { simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1]), simde_uint64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -379,11 +454,19 @@ simde_vst3q_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_float16x8x3_t simde_float16x8_private a_[3] = { simde_float16x8_to_private(val.val[0]), simde_float16x8_to_private(val.val[1]), simde_float16x8_to_private(val.val[2]) }; - simde_float16_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { - buf[i] = a_[i % 3].values[i / 3]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)ptr, 8); + dest = 
__riscv_vset_v_f16m1_f16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_f16m1x3 ((_Float16 *)ptr, dest, 8); + #else + simde_float16_t buf[24]; + for (size_t i = 0; i < 24 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -400,7 +483,13 @@ simde_vst3q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t simde_float32x4_private a_[3] = { simde_float32x4_to_private(val.val[0]), simde_float32x4_to_private(val.val[1]), simde_float32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_f32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -420,7 +509,7 @@ simde_vst3q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else simde_float32_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -441,7 +530,13 @@ simde_vst3q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float64x2x3_t simde_float64x2_private a[3] = { simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1]), simde_float64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(ptr, 2); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_f64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -450,7 +545,7 @@ simde_vst3q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float64x2x3_t simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else simde_float64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -471,7 +566,13 @@ simde_vst3q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { simde_int8x16_private a_[3] = { simde_int8x16_to_private(val.val[0]), simde_int8x16_to_private(val.val[1]), simde_int8x16_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_i8m1x3 (ptr, dest, 16); + #elif 
defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_[0].values, a_[1].values, 0, 16, 6, 1, 17, 7, 2, 18, 8, 3, 19, 9, 4, 20, 10, 5); @@ -496,7 +597,7 @@ simde_vst3q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { simde_memcpy(&ptr[32], &m2, sizeof(m2)); #else int8_t buf[48]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 48 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -517,7 +618,13 @@ simde_vst3q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { simde_int16x8_private a_[3] = { simde_int16x8_to_private(val.val[0]), simde_int16x8_to_private(val.val[1]), simde_int16x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_i16m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 16, r0, a_[2].values, @@ -537,7 +644,7 @@ simde_vst3q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else int16_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -558,7 +665,13 @@ simde_vst3q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { simde_int32x4_private a_[3] = { simde_int32x4_to_private(val.val[0]), simde_int32x4_to_private(val.val[1]), simde_int32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_i32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -578,7 +691,7 @@ simde_vst3q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else int32_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -599,7 +712,13 @@ simde_vst3q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { simde_int64x2_private a[3] = { simde_int64x2_to_private(val.val[0]), simde_int64x2_to_private(val.val[1]), simde_int64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_i64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = 
SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -608,7 +727,7 @@ simde_vst3q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else int64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -661,6 +780,12 @@ simde_vst3q_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { v128_t m2 = wasm_i8x16_shuffle(r2, r1, 0, 1, 18, 3, 4, 21, 6, 7, 24, 9, 10, 27, 12, 13, 30, 15); wasm_v128_store(ptr + 32, m2); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_[0].values, a_[1].values, 0, 16, 6, 1, 17, 7, 2, 18, 8, 3, 19, 9, @@ -686,7 +811,7 @@ simde_vst3q_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { simde_memcpy(&ptr[32], &m2, sizeof(m2)); #else uint8_t buf[48]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 48 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -708,7 +833,13 @@ simde_vst3q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) { simde_uint16x8_to_private(val.val[1]), simde_uint16x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 16, r0, a_[2].values, @@ -728,7 +859,7 @@ simde_vst3q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else uint16_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -750,7 +881,13 @@ simde_vst3q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) { simde_uint32x4_to_private(val.val[1]), simde_uint32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_u32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -770,7 +907,7 
@@ simde_vst3q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else uint32_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -791,7 +928,13 @@ simde_vst3q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { simde_uint64x2_private a[3] = { simde_uint64x2_to_private(val.val[0]), simde_uint64x2_to_private(val.val[1]), simde_uint64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -800,7 +943,7 @@ simde_vst3q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else uint64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -821,11 +964,19 @@ simde_vst3_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly8x8x3_t val) simde_poly8x8_private a_[3] = { simde_poly8x8_to_private(val.val[0]), simde_poly8x8_to_private(val.val[1]), simde_poly8x8_to_private(val.val[2]) }; - simde_poly8_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { - buf[i] = a_[i % 3].values[i / 3]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 8); + #else + simde_poly8_t buf[24]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -842,11 +993,19 @@ simde_vst3_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_poly16x4x3_t va simde_poly16x4_private a_[3] = { simde_poly16x4_to_private(val.val[0]), simde_poly16x4_to_private(val.val[1]), simde_poly16x4_to_private(val.val[2]) }; - simde_poly16_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { - buf[i] = a_[i % 3].values[i / 3]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 4); + #else + simde_poly16_t buf[12]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + 
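
N.B. (annotation, not part of the diff): the portable #else branches kept above all rely on the same index arithmetic: output element i takes lane i / 3 of vector i % 3, which reproduces the vst3 interleaved layout in a scratch buffer before a single memcpy. A tiny self-contained check of that indexing (plain C, illustrative only):

    #include <stdio.h>
    #include <stddef.h>

    int main(void) {
      int v[3][2] = { {10, 11}, {20, 21}, {30, 31} };  /* three 2-lane vectors */
      int buf[6];
      for (size_t i = 0; i < 6; i++)
        buf[i] = v[i % 3][i / 3];
      for (size_t i = 0; i < 6; i++)
        printf("%d ", buf[i]);   /* prints: 10 20 30 11 21 31 */
      printf("\n");
      return 0;
    }
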
#endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -863,9 +1022,17 @@ simde_vst3_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t val simde_poly64x1_private a_[3] = { simde_poly64x1_to_private(val.val[0]), simde_poly64x1_to_private(val.val[1]), simde_poly64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) @@ -882,11 +1049,19 @@ simde_vst3q_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_poly8x16x3_t val simde_poly8x16_private a_[3] = {simde_poly8x16_to_private(val.val[0]), simde_poly8x16_to_private(val.val[1]), simde_poly8x16_to_private(val.val[2])}; - simde_poly8_t buf[48]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { - buf[i] = a_[i % 3].values[i / 3]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 16); + #else + simde_poly8_t buf[48]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -904,11 +1079,19 @@ simde_vst3q_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly16x8x3_t v simde_poly16x8_to_private(val.val[1]), simde_poly16x8_to_private(val.val[2]) }; - simde_poly16_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { - buf[i] = a_[i % 3].values[i / 3]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 8); + #else + simde_poly16_t buf[24]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -922,14 +1105,22 @@ simde_vst3q_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_poly64x2x3_t va #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst3q_p64(ptr, val); #else - simde_poly64x2_private a[3] = { simde_poly64x2_to_private(val.val[0]), + simde_poly64x2_private a_[3] = { simde_poly64x2_to_private(val.val[0]), simde_poly64x2_to_private(val.val[1]), simde_poly64x2_to_private(val.val[2]) }; - simde_poly64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; 
i++) { - buf[i] = a[i % 3].values[i / 3]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 2); + #else + simde_poly64_t buf[6]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/st4.h b/simde/arm/neon/st4.h index 7ed0ec480..1cb3eab23 100644 --- a/simde/arm/neon/st4.h +++ b/simde/arm/neon/st4.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST4_H) @@ -43,13 +44,22 @@ simde_vst4_f16(simde_float16_t *ptr, simde_float16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst4_f16(ptr, val); #else - simde_float16_t buf[16]; simde_float16x4_private a_[4] = { simde_float16x4_to_private(val.val[0]), simde_float16x4_to_private(val.val[1]), simde_float16x4_to_private(val.val[2]), simde_float16x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)ptr, 4); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_f16m1x4 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -63,13 +73,22 @@ simde_vst4_f32(simde_float32_t *ptr, simde_float32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_f32(ptr, val); #else - simde_float32_t buf[8]; simde_float32x2_private a_[4] = { simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1]), simde_float32x2_to_private(val.val[2]), simde_float32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_f32m1x4 (ptr, dest, 2); + #else + simde_float32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -83,13 +102,22 @@ simde_vst4_f64(simde_float64_t *ptr, 
simde_float64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4_f64(ptr, val); #else - simde_float64_t buf[4]; simde_float64x1_private a_[4] = { simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1]), simde_float64x1_to_private(val.val[2]), simde_float64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_f64m1x4(ptr, dest, 1); + #else + simde_float64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -103,13 +131,22 @@ simde_vst4_s8(int8_t *ptr, simde_int8x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s8(ptr, val); #else - int8_t buf[32]; simde_int8x8_private a_[4] = { simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1]), simde_int8x8_to_private(val.val[2]), simde_int8x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_i8m1x4(ptr, dest, 8); + #else + int8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -123,13 +160,22 @@ simde_vst4_s16(int16_t *ptr, simde_int16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s16(ptr, val); #else - int16_t buf[16]; simde_int16x4_private a_[4] = { simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1]), simde_int16x4_to_private(val.val[2]), simde_int16x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_i16m1x4 (ptr, dest, 4); + #else + int16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -143,13 +189,22 @@ simde_vst4_s32(int32_t *ptr, simde_int32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s32(ptr, val); #else - int32_t buf[8]; simde_int32x2_private 
a_[4] = { simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1]), simde_int32x2_to_private(val.val[2]), simde_int32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_i32m1x4 (ptr, dest, 2); + #else + int32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -163,13 +218,22 @@ simde_vst4_s64(int64_t *ptr, simde_int64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s64(ptr, val); #else - int64_t buf[4]; simde_int64x1_private a_[4] = { simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1]), simde_int64x1_to_private(val.val[2]), simde_int64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_i64m1x4 (ptr, dest, 1); + #else + int64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -183,13 +247,22 @@ simde_vst4_u8(uint8_t *ptr, simde_uint8x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u8(ptr, val); #else - uint8_t buf[32]; simde_uint8x8_private a_[4] = { simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1]), simde_uint8x8_to_private(val.val[2]), simde_uint8x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 8); + #else + uint8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -203,13 +276,22 @@ simde_vst4_u16(uint16_t *ptr, simde_uint16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u16(ptr, val); #else - uint16_t buf[16]; simde_uint16x4_private a_[4] = { simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1]), simde_uint16x4_to_private(val.val[2]), 
simde_uint16x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 4); + #else + uint16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -223,13 +305,22 @@ simde_vst4_u32(uint32_t *ptr, simde_uint32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u32(ptr, val); #else - uint32_t buf[8]; simde_uint32x2_private a_[4] = { simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1]), simde_uint32x2_to_private(val.val[2]), simde_uint32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_u32m1x4 (ptr, dest, 2); + #else + uint32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -243,13 +334,22 @@ simde_vst4_u64(uint64_t *ptr, simde_uint64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u64(ptr, val); #else - uint64_t buf[4]; simde_uint64x1_private a_[4] = { simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1]), simde_uint64x1_to_private(val.val[2]), simde_uint64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 1); + #else + uint64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -263,13 +363,22 @@ simde_vst4q_f16(simde_float16_t *ptr, simde_float16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) vst4q_f16(ptr, val); #else - simde_float16_t buf[32]; simde_float16x8_private a_[4] = { simde_float16x8_to_private(val.val[0]), simde_float16x8_to_private(val.val[1]), simde_float16x8_to_private(val.val[2]), simde_float16x8_to_private(val.val[3]) }; - for (size_t 
i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)ptr, 8); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_f16m1x4 ((_Float16 *)ptr, dest, 8); + #else + simde_float16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -283,13 +392,22 @@ simde_vst4q_f32(simde_float32_t *ptr, simde_float32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_f32(ptr, val); #else - simde_float32_t buf[16]; simde_float32x4_private a_[4] = { simde_float32x4_to_private(val.val[0]), simde_float32x4_to_private(val.val[1]), simde_float32x4_to_private(val.val[2]), simde_float32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_f32m1x4 (ptr, dest, 4); + #else + simde_float32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -303,13 +421,22 @@ simde_vst4q_f64(simde_float64_t *ptr, simde_float64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_f64(ptr, val); #else - simde_float64_t buf[8]; simde_float64x2_private a_[4] = { simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1]), simde_float64x2_to_private(val.val[2]), simde_float64x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(ptr, 2); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_f64m1x4 (ptr, dest, 2); + #else + simde_float64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -323,13 +450,22 @@ simde_vst4q_s8(int8_t *ptr, simde_int8x16x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s8(ptr, val); #else - int8_t buf[64]; simde_int8x16_private a_[4] = { simde_int8x16_to_private(val.val[0]), simde_int8x16_to_private(val.val[1]), simde_int8x16_to_private(val.val[2]), 
simde_int8x16_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_i8m1x4 (ptr, dest, 16); + #else + int8_t buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -343,13 +479,22 @@ simde_vst4q_s16(int16_t *ptr, simde_int16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s16(ptr, val); #else - int16_t buf[32]; simde_int16x8_private a_[4] = { simde_int16x8_to_private(val.val[0]), simde_int16x8_to_private(val.val[1]), simde_int16x8_to_private(val.val[2]), simde_int16x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_i16m1x4 (ptr, dest, 8); + #else + int16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -363,13 +508,22 @@ simde_vst4q_s32(int32_t *ptr, simde_int32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s32(ptr, val); #else - int32_t buf[16]; simde_int32x4_private a_[4] = { simde_int32x4_to_private(val.val[0]), simde_int32x4_to_private(val.val[1]), simde_int32x4_to_private(val.val[2]), simde_int32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_i32m1x4 (ptr, dest, 4); + #else + int32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -383,13 +537,22 @@ simde_vst4q_s64(int64_t *ptr, simde_int64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_s64(ptr, val); #else - int64_t buf[8]; simde_int64x2_private a_[4] = { simde_int64x2_to_private(val.val[0]), simde_int64x2_to_private(val.val[1]), simde_int64x2_to_private(val.val[2]), simde_int64x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i 
% 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_i64m1x4 (ptr, dest, 2); + #else + int64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -404,13 +567,22 @@ simde_vst4q_u8(uint8_t *ptr, simde_uint8x16x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u8(ptr, val); #else - uint8_t buf[64]; simde_uint8x16_private a_[4] = { simde_uint8x16_to_private(val.val[0]), simde_uint8x16_to_private(val.val[1]), simde_uint8x16_to_private(val.val[2]), simde_uint8x16_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 16); + #else + uint8_t buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -424,13 +596,22 @@ simde_vst4q_u16(uint16_t *ptr, simde_uint16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u16(ptr, val); #else - uint16_t buf[32]; simde_uint16x8_private a_[4] = { simde_uint16x8_to_private(val.val[0]), simde_uint16x8_to_private(val.val[1]), simde_uint16x8_to_private(val.val[2]), simde_uint16x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 8); + #else + uint16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -444,13 +625,22 @@ simde_vst4q_u32(uint32_t *ptr, simde_uint32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u32(ptr, val); #else - uint32_t buf[16]; simde_uint32x4_private a_[4] = { simde_uint32x4_to_private(val.val[0]), simde_uint32x4_to_private(val.val[1]), simde_uint32x4_to_private(val.val[2]), simde_uint32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + 
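
N.B. (annotation, not part of the diff): as a usage reminder for the st4 family being modified here, vst4-style stores are the standard way to merge four planes into interleaved records (e.g. R, G, B, A bytes into RGBA pixels), so RVV's segmented store is a natural fit. A sketch against the portable SIMDe API (hypothetical demo function; assumes simde/arm/neon.h is on the include path):

    #include <simde/arm/neon.h>
    #include <stdint.h>

    /* Pack 8 RGBA pixels from four 8-lane planes; out must hold 32 bytes. */
    void pack_rgba8(uint8_t out[32], const uint8_t r[8], const uint8_t g[8],
                    const uint8_t b[8], const uint8_t a[8]) {
      simde_uint8x8x4_t px;
      px.val[0] = simde_vld1_u8(r);
      px.val[1] = simde_vld1_u8(g);
      px.val[2] = simde_vld1_u8(b);
      px.val[3] = simde_vld1_u8(a);
      simde_vst4_u8(out, px);   /* out = r0,g0,b0,a0, r1,g1,b1,a1, ... */
    }
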
vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_u32m1x4 (ptr, dest, 4); + #else + uint32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -464,13 +654,22 @@ simde_vst4q_u64(uint64_t *ptr, simde_uint64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_u64(ptr, val); #else - uint64_t buf[8]; simde_uint64x2_private a_[4] = { simde_uint64x2_to_private(val.val[0]), simde_uint64x2_to_private(val.val[1]), simde_uint64x2_to_private(val.val[2]), simde_uint64x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 2); + #else + uint64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -484,13 +683,22 @@ simde_vst4_p8(simde_poly8_t *ptr, simde_poly8x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_p8(ptr, val); #else - simde_poly8_t buf[32]; simde_poly8x8_private a_[4] = { simde_poly8x8_to_private(val.val[0]), simde_poly8x8_to_private(val.val[1]), simde_poly8x8_to_private(val.val[2]), simde_poly8x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 8); + #else + simde_poly8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -504,13 +712,22 @@ simde_vst4_p16(simde_poly16_t *ptr, simde_poly16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_p16(ptr, val); #else - simde_poly16_t buf[16]; simde_poly16x4_private a_[4] = { simde_poly16x4_to_private(val.val[0]), simde_poly16x4_to_private(val.val[1]), simde_poly16x4_to_private(val.val[2]), simde_poly16x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 4); + dest = 
__riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 4); + #else + simde_poly16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -524,13 +741,22 @@ simde_vst4_p64(simde_poly64_t *ptr, simde_poly64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) vst4_p64(ptr, val); #else - simde_poly64_t buf[4]; simde_poly64x1_private a_[4] = { simde_poly64x1_to_private(val.val[0]), simde_poly64x1_to_private(val.val[1]), simde_poly64x1_to_private(val.val[2]), simde_poly64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 1); + #else + simde_poly64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) @@ -544,13 +770,22 @@ simde_vst4q_p8(simde_poly8_t *ptr, simde_poly8x16x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_p8(ptr, val); #else - simde_poly8_t buf[64]; simde_poly8x16_private a_[4] = { simde_poly8x16_to_private(val.val[0]), simde_poly8x16_to_private(val.val[1]), simde_poly8x16_to_private(val.val[2]), simde_poly8x16_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 16); + #else + simde_poly8_t buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -564,13 +799,22 @@ simde_vst4q_p16(simde_poly16_t *ptr, simde_poly16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_p16(ptr, val); #else - simde_poly16_t buf[32]; simde_poly16x8_private a_[4] = { simde_poly16x8_to_private(val.val[0]), simde_poly16x8_to_private(val.val[1]), simde_poly16x8_to_private(val.val[2]), simde_poly16x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv128); + dest 
= __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 8); + #else + simde_poly16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -584,13 +828,22 @@ simde_vst4q_p64(simde_poly64_t *ptr, simde_poly64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_p64(ptr, val); #else - simde_poly64_t buf[8]; simde_poly64x2_private a_[4] = { simde_poly64x2_to_private(val.val[0]), simde_poly64x2_to_private(val.val[1]), simde_poly64x2_to_private(val.val[2]), simde_poly64x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 2); + #else + simde_poly64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -604,9 +857,9 @@ simde_vst4_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) vst4_bf16(ptr, val); #else - simde_bfloat16_t buf[16]; simde_bfloat16x4_private a_[4] = { simde_bfloat16x4_to_private(val.val[0]), simde_bfloat16x4_to_private(val.val[1]), simde_bfloat16x4_to_private(val.val[2]), simde_bfloat16x4_to_private(val.val[3]) }; + simde_bfloat16_t buf[16]; for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { buf[i] = a_[i % 4].values[i / 4]; } @@ -624,9 +877,9 @@ simde_vst4q_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) vst4q_bf16(ptr, val); #else - simde_bfloat16_t buf[32]; simde_bfloat16x8_private a_[4] = { simde_bfloat16x8_to_private(val.val[0]), simde_bfloat16x8_to_private(val.val[1]), simde_bfloat16x8_to_private(val.val[2]), simde_bfloat16x8_to_private(val.val[3]) }; + simde_bfloat16_t buf[32]; for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { buf[i] = a_[i % 4].values[i / 4]; } diff --git a/simde/arm/neon/types.h b/simde/arm/neon/types.h index 50ea467c0..51ad0f825 100644 --- a/simde/arm/neon/types.h +++ b/simde/arm/neon/types.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_TYPES_H) @@ -48,6 +49,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint8m1_t sv64; + #endif + } simde_int8x8_private; typedef union { @@ -56,6 +62,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint16m1_t sv64; + #endif + } simde_int16x4_private; typedef union { @@ -64,6 +75,11 @@ typedef union 
{ #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint32m1_t sv64; + #endif + } simde_int32x2_private; typedef union { @@ -72,6 +88,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint64m1_t sv64; + #endif + } simde_int64x1_private; typedef union { @@ -80,6 +101,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv64; + #endif + } simde_uint8x8_private; typedef union { @@ -88,6 +114,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv64; + #endif + } simde_uint16x4_private; typedef union { @@ -96,6 +127,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint32m1_t sv64; + #endif + } simde_uint32x2_private; typedef union { @@ -104,6 +140,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv64; + #endif + } simde_uint64x1_private; typedef union { @@ -116,6 +157,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + fixed_vfloat16m1_t sv64; + #endif + } simde_float16x4_private; typedef union { @@ -124,6 +170,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat32m1_t sv64; + #endif + } simde_float32x2_private; typedef union { @@ -132,18 +183,32 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat64m1_t sv64; + #endif + } simde_float64x1_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly8, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv64; + #endif } simde_poly8x8_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly16, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv64; + #endif } simde_poly16x4_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly64, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv64; + #endif } simde_poly64x1_private; typedef union { @@ -160,6 +225,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint8m1_t sv128; + #endif + } simde_int8x16_private; typedef union { @@ -176,6 +246,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint16m1_t sv128; + #endif + } simde_int16x8_private; typedef union { @@ -196,6 +271,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint32m1_t sv128; + #endif + } simde_int32x4_private; typedef union { @@ -212,6 +292,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint64m1_t sv128; + #endif + } simde_int64x2_private; typedef union { @@ -228,6 +313,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv128; + #endif + } simde_uint8x16_private; typedef union { @@ -244,6 +334,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + 
fixed_vuint16m1_t sv128; + #endif + } simde_uint16x8_private; typedef union { @@ -260,6 +355,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint32m1_t sv128; + #endif + } simde_uint32x4_private; typedef union { @@ -276,6 +376,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv128; + #endif + } simde_uint64x2_private; typedef union { @@ -296,6 +401,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + fixed_vfloat16m1_t sv128; + #endif + } simde_float16x8_private; typedef union { @@ -312,6 +422,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat32m1_t sv128; + #endif + } simde_float32x4_private; typedef union { @@ -328,18 +443,32 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat64m1_t sv128; + #endif + } simde_float64x2_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly8, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv128; + #endif } simde_poly8x16_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly16, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv128; + #endif } simde_poly16x8_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly64, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv128; + #endif } simde_poly64x2_private; typedef union { @@ -647,6 +776,42 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_U64X2 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2 #endif +#elif defined(SIMDE_RISCV_V_NATIVE) + + typedef fixed_vint8m1_t simde_int8x8_t; + typedef fixed_vint16m1_t simde_int16x4_t; + typedef fixed_vint32m1_t simde_int32x2_t; + typedef fixed_vint64m1_t simde_int64x1_t; + typedef fixed_vuint8m1_t simde_uint8x8_t; + typedef fixed_vuint16m1_t simde_uint16x4_t; + typedef fixed_vuint32m1_t simde_uint32x2_t; + typedef fixed_vuint64m1_t simde_uint64x1_t; + typedef fixed_vfloat32m1_t simde_float32x2_t; + typedef fixed_vfloat64m1_t simde_float64x1_t; + + typedef fixed_vint8m1_t simde_int8x16_t; + typedef fixed_vint16m1_t simde_int16x8_t; + typedef fixed_vint32m1_t simde_int32x4_t; + typedef fixed_vint64m1_t simde_int64x2_t; + typedef fixed_vuint8m1_t simde_uint8x16_t; + typedef fixed_vuint16m1_t simde_uint16x8_t; + typedef fixed_vuint32m1_t simde_uint32x4_t; + typedef fixed_vuint64m1_t simde_uint64x2_t; + typedef fixed_vfloat32m1_t simde_float32x4_t; + typedef fixed_vfloat64m1_t simde_float64x2_t; + + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 + #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 + #elif defined(SIMDE_VECTOR) typedef simde_float32 simde_float32_t; typedef simde_float64 simde_float64_t; diff --git a/simde/simde-arch.h b/simde/simde-arch.h index a492d7edc..959226b8e 100644 --- a/simde/simde-arch.h +++ b/simde/simde-arch.h @@ -479,8 +479,40 @@ #define 
SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) (0) #endif -#if defined(__riscv) && __riscv_xlen==64 -# define SIMDE_ARCH_RISCV64 +/* RISC-V + */ +#if defined(__riscv) || defined(__riscv__) +# if __riscv_xlen == 64 +# define SIMDE_ARCH_RISCV64 +# elif __riscv_xlen == 32 +# define SIMDE_ARCH_RISCV32 +# endif +#endif + +/* RISC-V SIMD ISA extensions */ +#if defined(__riscv_zve32x) +# define SIMDE_ARCH_RISCV_ZVE32X 1 +#endif +#if defined(__riscv_zve32f) +# define SIMDE_ARCH_RISCV_ZVE32F 1 +#endif +#if defined(__riscv_zve64x) +# define SIMDE_ARCH_RISCV_ZVE64X 1 +#endif +#if defined(__riscv_zve64f) +# define SIMDE_ARCH_RISCV_ZVE64F 1 +#endif +#if defined(__riscv_zve64d) +# define SIMDE_ARCH_RISCV_ZVE64D 1 +#endif +#if defined(__riscv_v) +# define SIMDE_ARCH_RISCV_V 1 +#endif +#if defined(__riscv_zvfh) +# define SIMDE_ARCH_RISCV_ZVFH 1 +#endif +#if defined(__riscv_zvfhmin) +# define SIMDE_ARCH_RISCV_ZVFHMIN 1 #endif /* SPARC diff --git a/simde/simde-common.h b/simde/simde-common.h index 2476dd77f..9786649d3 100644 --- a/simde/simde-common.h +++ b/simde/simde-common.h @@ -23,6 +23,7 @@ * Copyright: * 2017-2020 Evan Nemerson * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_COMMON_H) @@ -1189,6 +1190,34 @@ HEDLEY_DIAGNOSTIC_POP #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(int##width##_t, (value)) #endif +/* Initial support for RISCV V extensions based on ZVE64D. */ +#if defined(SIMDE_ARCH_RISCV_ZVE64D) && SIMDE_NATURAL_VECTOR_SIZE >= 64 + #define RVV_FIXED_TYPE_DEF(name, lmul) \ + typedef vint8##name##_t fixed_vint8##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vint16##name##_t fixed_vint16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vint32##name##_t fixed_vint32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint8##name##_t fixed_vuint8##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint16##name##_t fixed_vuint16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint32##name##_t fixed_vuint32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vfloat32##name##_t fixed_vfloat32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF(mf2, 1/2); + RVV_FIXED_TYPE_DEF(m1, 1); + RVV_FIXED_TYPE_DEF(m2, 2); + #define RVV_FIXED_TYPE_DEF_64B(name, lmul) \ + typedef vint64##name##_t fixed_vint64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint64##name##_t fixed_vuint64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vfloat64##name##_t fixed_vfloat64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF_64B(m1, 1); + RVV_FIXED_TYPE_DEF_64B(m2, 2); + #if defined(SIMDE_ARCH_RISCV_ZVFH) + #define RVV_FIXED_TYPE_DEF_16F(name, lmul) \ + typedef vfloat16##name##_t fixed_vfloat16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF_16F(mf2, 1/2); + RVV_FIXED_TYPE_DEF_16F(m1, 1); + RVV_FIXED_TYPE_DEF_16F(m2, 2); + #endif +#endif + /* SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ */ HEDLEY_DIAGNOSTIC_POP diff --git a/simde/simde-f16.h b/simde/simde-f16.h index 632ef626a..5d10ce75e 100644 --- a/simde/simde-f16.h +++ b/simde/simde-f16.h 
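The fixed-size typedefs just above are the linchpin of the port: clang's riscv_rvv_vector_bits attribute pins a scalable RVV type to the width named by __riscv_v_fixed_vlen (which the compiler defines when -mrvv-vector-bits=<N> is given), so the type gains a well-defined sizeof and can sit inside SIMDe's private unions next to the portable values array. A minimal sketch of the mechanism, assuming clang 17 with -march=rv64gcv -mrvv-vector-bits=128; the demo_* names are illustrative, not part of the patch:

  #include <riscv_vector.h>
  #include <stdint.h>

  /* Pin the scalable vint32m1_t to the fixed VLEN chosen at compile time. */
  typedef vint32m1_t demo_fixed_vint32m1_t
      __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));

  /* Same shape as simde_int32x4_private after this patch: the portable
   * array and the RVV register view alias the same storage. */
  typedef union {
    int32_t values[4];
    demo_fixed_vint32m1_t sv128;
  } demo_int32x4_private;

  int32_t demo_add_lane0(demo_int32x4_private a, demo_int32x4_private b) {
    /* Fixed types convert implicitly to their scalable counterparts, so they
     * can be handed straight to the intrinsics; vl = 4 covers the 128-bit
     * NEON view even when the hardware VLEN is larger. */
    vint32m1_t sum = __riscv_vadd_vv_i32m1(a.sv128, b.sv128, 4);
    return __riscv_vmv_x_s_i32m1_i32(sum); /* vmv.x.s: read lane 0 */
  }

At -mrvv-vector-bits=128 the union is exactly 16 bytes; at larger settings the sv128 member, and with it the union, grows, which is what the reinterpret test changes further down have to account for.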
@@ -22,6 +22,7 @@ * * Copyright: * 2021 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #include "hedley.h" @@ -75,7 +76,8 @@ SIMDE_BEGIN_DECLS_ (defined(SIMDE_ARCH_X86_SSE2) && HEDLEY_GCC_VERSION_CHECK(12,0,0)) || \ (defined(SIMDE_ARCH_AARCH64) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !defined(__cplusplus)) || \ ((defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0)) || \ - (!(defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(6,0,0))) + (!(defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(6,0,0))) || \ + defined(SIMDE_ARCH_RISCV_ZVFH) /* We haven't found a better way to detect this. It seems like defining * __STDC_WANT_IEC_60559_TYPES_EXT__, then including float.h, then * checking for defined(FLT16_MAX) should work, but both gcc and diff --git a/simde/simde-features.h b/simde/simde-features.h index 622d12908..7b622ead3 100644 --- a/simde/simde-features.h +++ b/simde/simde-features.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ /* simde-arch.h is used to determine which features are available according @@ -378,6 +379,15 @@ #endif #endif +#if !defined(SIMDE_RISCV_V_NATIVE) && !defined(SIMDE_RISCV_V_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_RISCV_V) + #define SIMDE_RISCV_V_NATIVE + #endif +#endif +#if defined(SIMDE_RISCV_V_NATIVE) + #include <riscv_vector.h> +#endif + #if !defined(SIMDE_WASM_SIMD128_NATIVE) && !defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_WASM_SIMD128) #define SIMDE_WASM_SIMD128_NATIVE @@ -549,6 +559,9 @@ #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE (128) #define SIMDE_NATURAL_INT_VECTOR_SIZE (64) #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE (0) + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(__riscv_v_fixed_vlen) + //FIXME : SIMDE_NATURAL_VECTOR_SIZE == __riscv_v_fixed_vlen + #define SIMDE_NATURAL_VECTOR_SIZE (128) #endif #if !defined(SIMDE_NATURAL_VECTOR_SIZE) @@ -690,6 +703,10 @@ #define SIMDE_ARM_SVE_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_RISCV_V_NATIVE) + #define SIMDE_RISCV_V_ENABLE_NATIVE_ALIASES + #endif + #if !defined(SIMDE_MIPS_MSA_NATIVE) #define SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES #endif diff --git a/test/arm/neon/reinterpret.c b/test/arm/neon/reinterpret.c index 1c97de9e4..f8893737f 100644 --- a/test/arm/neon/reinterpret.c +++ b/test/arm/neon/reinterpret.c @@ -34,7 +34,7 @@ test_simde_vreinterpret_f32_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x2_t r = simde_vreinterpret_f32_s8(a); simde_float32x2_private r_ = simde_float32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -82,7 +82,7 @@ test_simde_vreinterpret_f64_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_s8(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -130,7 +130,7 @@ test_simde_vreinterpret_s16_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_t r = simde_vreinterpret_s16_s8(a); simde_int16x4_private r_ = simde_int16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -178,7 +178,7 @@ test_simde_vreinterpret_s32_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r =
simde_vreinterpret_s32_s8(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -226,7 +226,7 @@ test_simde_vreinterpret_s64_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_s8(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -274,7 +274,7 @@ test_simde_vreinterpret_u8_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_s8(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -321,7 +321,7 @@ test_simde_vreinterpret_u16_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_uint16x4_t r = simde_vreinterpret_u16_s8(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -368,7 +368,7 @@ test_simde_vreinterpret_u32_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_s8(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -415,7 +415,7 @@ test_simde_vreinterpret_u64_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_private a_ = simde_int8x8_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_s8(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -470,7 +470,7 @@ test_simde_vreinterpretq_f32_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_float32x4_t r = simde_vreinterpretq_f32_s8(a); simde_float32x4_private r_ = simde_float32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -526,7 +526,7 @@ test_simde_vreinterpretq_f64_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_s8(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -582,7 +582,7 @@ test_simde_vreinterpretq_s16_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_t r = simde_vreinterpretq_s16_s8(a); simde_int16x8_private r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -637,7 +637,7 @@ test_simde_vreinterpretq_s32_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_int32x4_t r = simde_vreinterpretq_s32_s8(a); simde_int32x4_private r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -692,7 +692,7 @@ test_simde_vreinterpretq_s64_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_int64x2_t r = simde_vreinterpretq_s64_s8(a); simde_int64x2_private r_ = 
simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -748,7 +748,7 @@ test_simde_vreinterpretq_u8_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_s8(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -803,7 +803,7 @@ test_simde_vreinterpretq_u16_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_uint16x8_t r = simde_vreinterpretq_u16_s8(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -859,7 +859,7 @@ test_simde_vreinterpretq_u32_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_s8(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -914,7 +914,7 @@ test_simde_vreinterpretq_u64_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_private a_ = simde_int8x16_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_s8(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -962,7 +962,7 @@ test_simde_vreinterpret_f32_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x2_t r = simde_vreinterpret_f32_s16(a); simde_float32x2_private r_ = simde_float32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1010,7 +1010,7 @@ test_simde_vreinterpret_f64_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_s16(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1058,7 +1058,7 @@ test_simde_vreinterpret_s8_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_t r = simde_vreinterpret_s8_s16(a); simde_int8x8_private r_ = simde_int8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1106,7 +1106,7 @@ test_simde_vreinterpret_s32_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r = simde_vreinterpret_s32_s16(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1154,7 +1154,7 @@ test_simde_vreinterpret_s64_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_s16(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1202,7 +1202,7 @@ test_simde_vreinterpret_u8_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_s16(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1249,7 +1249,7 @@ test_simde_vreinterpret_u16_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_private a_ = 
simde_int16x4_to_private(a); simde_uint16x4_t r = simde_vreinterpret_u16_s16(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1296,7 +1296,7 @@ test_simde_vreinterpret_u32_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_s16(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1343,7 +1343,7 @@ test_simde_vreinterpret_u64_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_s16(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1390,7 +1390,7 @@ test_simde_vreinterpretq_f32_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_float32x4_t r = simde_vreinterpretq_f32_s16(a); simde_float32x4_private r_ = simde_float32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1438,7 +1438,7 @@ test_simde_vreinterpretq_f64_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_s16(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1486,7 +1486,7 @@ test_simde_vreinterpretq_s8_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_t r = simde_vreinterpretq_s8_s16(a); simde_int8x16_private r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1533,7 +1533,7 @@ test_simde_vreinterpretq_s32_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int32x4_t r = simde_vreinterpretq_s32_s16(a); simde_int32x4_private r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1580,7 +1580,7 @@ test_simde_vreinterpretq_s64_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_int64x2_t r = simde_vreinterpretq_s64_s16(a); simde_int64x2_private r_ = simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1628,7 +1628,7 @@ test_simde_vreinterpretq_u8_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_s16(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1675,7 +1675,7 @@ test_simde_vreinterpretq_u16_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_uint16x8_t r = simde_vreinterpretq_u16_s16(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1723,7 +1723,7 @@ test_simde_vreinterpretq_u32_s16 
(SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_s16(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1770,7 +1770,7 @@ test_simde_vreinterpretq_u64_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_s16(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -1818,7 +1818,7 @@ test_simde_vreinterpret_f32_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x2_t r = simde_vreinterpret_f32_s32(a); simde_float32x2_private r_ = simde_float32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1866,7 +1866,7 @@ test_simde_vreinterpret_f64_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_s32(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1914,7 +1914,7 @@ test_simde_vreinterpret_s8_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_t r = simde_vreinterpret_s8_s32(a); simde_int8x8_private r_ = simde_int8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -1962,7 +1962,7 @@ test_simde_vreinterpret_s16_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_t r = simde_vreinterpret_s16_s32(a); simde_int16x4_private r_ = simde_int16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2010,7 +2010,7 @@ test_simde_vreinterpret_s64_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_s32(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2058,7 +2058,7 @@ test_simde_vreinterpret_u8_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_s32(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2105,7 +2105,7 @@ test_simde_vreinterpret_u16_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_uint16x4_t r = simde_vreinterpret_u16_s32(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2152,7 +2152,7 @@ test_simde_vreinterpret_u32_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_s32(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2199,7 +2199,7 @@ test_simde_vreinterpret_u64_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_private a_ = simde_int32x2_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_s32(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - 
simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2246,7 +2246,7 @@ test_simde_vreinterpretq_f32_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_float32x4_t r = simde_vreinterpretq_f32_s32(a); simde_float32x4_private r_ = simde_float32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2294,7 +2294,7 @@ test_simde_vreinterpretq_f64_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_s32(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2342,7 +2342,7 @@ test_simde_vreinterpretq_s8_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_t r = simde_vreinterpretq_s8_s32(a); simde_int8x16_private r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2390,7 +2390,7 @@ test_simde_vreinterpretq_s16_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_t r = simde_vreinterpretq_s16_s32(a); simde_int16x8_private r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2437,7 +2437,7 @@ test_simde_vreinterpretq_s64_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_int64x2_t r = simde_vreinterpretq_s64_s32(a); simde_int64x2_private r_ = simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2485,7 +2485,7 @@ test_simde_vreinterpretq_u8_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_s32(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2532,7 +2532,7 @@ test_simde_vreinterpretq_u16_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_uint16x8_t r = simde_vreinterpretq_u16_s32(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2580,7 +2580,7 @@ test_simde_vreinterpretq_u32_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_s32(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2627,7 +2627,7 @@ test_simde_vreinterpretq_u64_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x4_private a_ = simde_int32x4_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_s32(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -2675,7 +2675,7 @@ test_simde_vreinterpret_f32_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x2_t r = simde_vreinterpret_f32_s64(a); simde_float32x2_private r_ = simde_float32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } 
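A note on the pattern running through this whole test file: the move from sizeof(r_) to explicit byte counts is forced by the new union members. Once a fixed_v*m1_t field is present, sizeof a private type is VLEN/8 bytes rather than the NEON register width, so a memcmp over sizeof(r_) would also compare the indeterminate tail beyond the 8 meaningful bytes of a d-register (16 for a q-register). A sketch of the size inflation, assuming -mrvv-vector-bits=512; demo_* names are illustrative:

  #include <riscv_vector.h>
  #include <stdint.h>

  typedef vint8m1_t demo_fixed_vint8m1_t
      __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));

  typedef union {
    int8_t values[8];            /* the 8 bytes a NEON d-register holds */
    demo_fixed_vint8m1_t sv64;   /* at VLEN = 512 this member is 64 bytes */
  } demo_int8x8_private;

  /* memcmp(&r_, &a_, sizeof(r_)) would compare all 64 bytes, 56 of which are
   * never written by an 8-byte vector operation; hence the explicit 8 here
   * and 16 in the q-register tests. */
  _Static_assert(sizeof(demo_int8x8_private) == __riscv_v_fixed_vlen / 8,
                 "the union is padded out to the fixed VLEN");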
return 0; @@ -2723,7 +2723,7 @@ test_simde_vreinterpret_f64_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_s64(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2771,7 +2771,7 @@ test_simde_vreinterpret_s8_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_t r = simde_vreinterpret_s8_s64(a); simde_int8x8_private r_ = simde_int8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2819,7 +2819,7 @@ test_simde_vreinterpret_s16_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_t r = simde_vreinterpret_s16_s64(a); simde_int16x4_private r_ = simde_int16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2867,7 +2867,7 @@ test_simde_vreinterpret_s32_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r = simde_vreinterpret_s32_s64(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2915,7 +2915,7 @@ test_simde_vreinterpret_u8_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_s64(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -2962,7 +2962,7 @@ test_simde_vreinterpret_u16_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_uint16x4_t r = simde_vreinterpret_u16_s64(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3009,7 +3009,7 @@ test_simde_vreinterpret_u32_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_s64(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3056,7 +3056,7 @@ test_simde_vreinterpret_u64_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_private a_ = simde_int64x1_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_s64(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3103,7 +3103,7 @@ test_simde_vreinterpretq_f32_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_float32x4_t r = simde_vreinterpretq_f32_s64(a); simde_float32x4_private r_ = simde_float32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3151,7 +3151,7 @@ test_simde_vreinterpretq_f64_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_s64(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3199,7 +3199,7 @@ test_simde_vreinterpretq_s8_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_t r = simde_vreinterpretq_s8_s64(a); 
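Stepping back to the st4.h hunks earlier in the patch: every vst4 variant there follows one shape, mapping NEON's 4-way interleaved store onto RVV's segment stores. Condensed into a standalone sketch for uint32 (this mirrors the diff; vl = 4 is the per-vector element count of the 128-bit q form, and demo_vst4q_u32 is an illustrative name):

  #include <riscv_vector.h>
  #include <stdint.h>

  /* Store a0 b0 c0 d0 a1 b1 c1 d1 ... to ptr, i.e. what vst4q_u32 does. */
  static void demo_vst4q_u32(uint32_t *ptr,
                             vuint32m1_t a, vuint32m1_t b,
                             vuint32m1_t c, vuint32m1_t d) {
    /* The patch seeds the tuple with a segment load from ptr; all four
     * fields are overwritten below, so the load serves only to give the
     * tuple an initial value. */
    vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(ptr, 4);
    dest = __riscv_vset_v_u32m1_u32m1x4(dest, 0, a);
    dest = __riscv_vset_v_u32m1_u32m1x4(dest, 1, b);
    dest = __riscv_vset_v_u32m1_u32m1x4(dest, 2, c);
    dest = __riscv_vset_v_u32m1_u32m1x4(dest, 3, d);
    /* vsseg4e32 writes the four registers back out interleaved. */
    __riscv_vsseg4e32_v_u32m1x4(ptr, dest, 4);
  }

Where the intrinsics offer a tuple constructor (__riscv_vcreate_* in newer revisions of the RVV intrinsics spec), the initial load could be dropped; the load-then-overwrite form is what this patch uses throughout st4.h.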
simde_int8x16_private r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3247,7 +3247,7 @@ test_simde_vreinterpretq_s16_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_t r = simde_vreinterpretq_s16_s64(a); simde_int16x8_private r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3294,7 +3294,7 @@ test_simde_vreinterpretq_s32_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_int32x4_t r = simde_vreinterpretq_s32_s64(a); simde_int32x4_private r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3342,7 +3342,7 @@ test_simde_vreinterpretq_u8_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_s64(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3389,7 +3389,7 @@ test_simde_vreinterpretq_u16_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_uint16x8_t r = simde_vreinterpretq_u16_s64(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3437,7 +3437,7 @@ test_simde_vreinterpretq_u32_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_s64(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3484,7 +3484,7 @@ test_simde_vreinterpretq_u64_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x2_private a_ = simde_int64x2_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_s64(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -3532,7 +3532,7 @@ test_simde_vreinterpret_f32_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x2_t r = simde_vreinterpret_f32_u8(a); simde_float32x2_private r_ = simde_float32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3580,7 +3580,7 @@ test_simde_vreinterpret_f64_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_u8(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3628,7 +3628,7 @@ test_simde_vreinterpret_s8_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_t r = simde_vreinterpret_s8_u8(a); simde_int8x8_private r_ = simde_int8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3676,7 +3676,7 @@ test_simde_vreinterpret_s16_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_t r = simde_vreinterpret_s16_u8(a); simde_int16x4_private r_ = simde_int16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3724,7
+3724,7 @@ test_simde_vreinterpret_s32_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r = simde_vreinterpret_s32_u8(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3772,7 +3772,7 @@ test_simde_vreinterpret_s64_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_u8(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3819,7 +3819,7 @@ test_simde_vreinterpret_u16_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_private a_ = simde_uint8x8_to_private(a); simde_uint16x4_t r = simde_vreinterpret_u16_u8(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3866,7 +3866,7 @@ test_simde_vreinterpret_u32_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_private a_ = simde_uint8x8_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_u8(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3913,7 +3913,7 @@ test_simde_vreinterpret_u64_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_private a_ = simde_uint8x8_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_u8(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -3968,7 +3968,7 @@ test_simde_vreinterpretq_f32_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a); simde_float32x4_t r = simde_vreinterpretq_f32_u8(a); simde_float32x4_private r_ = simde_float32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4024,7 +4024,7 @@ test_simde_vreinterpretq_f64_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_u8(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4080,7 +4080,7 @@ test_simde_vreinterpretq_s8_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_t r = simde_vreinterpretq_s8_u8(a); simde_int8x16_private r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4136,7 +4136,7 @@ test_simde_vreinterpretq_s16_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_t r = simde_vreinterpretq_s16_u8(a); simde_int16x8_private r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4191,7 +4191,7 @@ test_simde_vreinterpretq_s32_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a); simde_int32x4_t r = simde_vreinterpretq_s32_u8(a); simde_int32x4_private r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4246,7 +4246,7 @@ test_simde_vreinterpretq_s64_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_private a_ =
simde_uint8x16_to_private(a); simde_int64x2_t r = simde_vreinterpretq_s64_u8(a); simde_int64x2_private r_ = simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4301,7 +4301,7 @@ test_simde_vreinterpretq_u16_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a); simde_uint16x8_t r = simde_vreinterpretq_u16_u8(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4357,7 +4357,7 @@ test_simde_vreinterpretq_u32_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_u8(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4412,7 +4412,7 @@ test_simde_vreinterpretq_u64_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_u8(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4460,7 +4460,7 @@ test_simde_vreinterpret_f32_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x2_t r = simde_vreinterpret_f32_u16(a); simde_float32x2_private r_ = simde_float32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -4508,7 +4508,7 @@ test_simde_vreinterpret_f64_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_u16(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -4556,7 +4556,7 @@ test_simde_vreinterpret_s8_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_t r = simde_vreinterpret_s8_u16(a); simde_int8x8_private r_ = simde_int8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -4604,7 +4604,7 @@ test_simde_vreinterpret_s16_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_t r = simde_vreinterpret_s16_u16(a); simde_int16x4_private r_ = simde_int16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -4652,7 +4652,7 @@ test_simde_vreinterpret_s32_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r = simde_vreinterpret_s32_u16(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -4700,7 +4700,7 @@ test_simde_vreinterpret_s64_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_u16(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -4748,7 +4748,7 @@ test_simde_vreinterpret_u8_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_u16(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); }
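One detail worth calling out from the poly hunks: RVV has no polynomial element types, so the patch backs simde_poly8/16/64 vectors with plain unsigned fixed types (the poly unions in types.h gain fixed_vuint*m1_t members) and drives vst4_p8 and friends through the unsigned segment intrinsics. That is sound because NEON polynomial vectors are bit-identical to their unsigned counterparts in memory; only the arithmetic semantics differ, and loads and stores never touch those. A small sketch of the idea; demo_* names are illustrative:

  #include <riscv_vector.h>
  #include <stdint.h>

  typedef vuint8m1_t demo_fixed_vuint8m1_t
      __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));

  typedef uint8_t demo_poly8_t; /* simde_poly8_t is likewise an unsigned byte
                                 * when there is no native polynomial type */

  typedef union {
    demo_poly8_t values[8];
    demo_fixed_vuint8m1_t sv64; /* poly data rides in an unsigned register */
  } demo_poly8x8_private;

  static void demo_store_p8(demo_poly8_t *ptr, demo_poly8x8_private v) {
    /* Same unsigned intrinsic the u8 path uses; only the bits matter. */
    __riscv_vse8_v_u8m1(ptr, v.sv64, 8);
  }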
return 0; @@ -4795,7 +4795,7 @@ test_simde_vreinterpret_u32_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x4_private a_ = simde_uint16x4_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_u16(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -4842,7 +4842,7 @@ test_simde_vreinterpret_u64_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x4_private a_ = simde_uint16x4_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_u16(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -4889,7 +4889,7 @@ test_simde_vreinterpretq_f32_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_float32x4_t r = simde_vreinterpretq_f32_u16(a); simde_float32x4_private r_ = simde_float32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4937,7 +4937,7 @@ test_simde_vreinterpretq_f64_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_u16(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -4985,7 +4985,7 @@ test_simde_vreinterpretq_s8_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_t r = simde_vreinterpretq_s8_u16(a); simde_int8x16_private r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -5033,7 +5033,7 @@ test_simde_vreinterpretq_s16_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_t r = simde_vreinterpretq_s16_u16(a); simde_int16x8_private r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -5080,7 +5080,7 @@ test_simde_vreinterpretq_s32_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_int32x4_t r = simde_vreinterpretq_s32_u16(a); simde_int32x4_private r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -5127,7 +5127,7 @@ test_simde_vreinterpretq_s64_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_int64x2_t r = simde_vreinterpretq_s64_u16(a); simde_int64x2_private r_ = simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -5175,7 +5175,7 @@ test_simde_vreinterpretq_u8_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_u16(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -5223,7 +5223,7 @@ test_simde_vreinterpretq_u32_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_u16(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -5270,7 +5270,7 @@ test_simde_vreinterpretq_u64_u16
(SIMDE_MUNIT_TEST_ARGS) { simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_u16(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -5309,7 +5309,7 @@ test_simde_vreinterpret_f16_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_float16x4_t r = simde_vreinterpret_f16_u16(a); simde_float16x4_private r_ = simde_float16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5357,7 +5357,7 @@ test_simde_vreinterpret_f32_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x2_t r = simde_vreinterpret_f32_u32(a); simde_float32x2_private r_ = simde_float32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5405,7 +5405,7 @@ test_simde_vreinterpret_f64_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_u32(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5453,7 +5453,7 @@ test_simde_vreinterpret_s8_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_t r = simde_vreinterpret_s8_u32(a); simde_int8x8_private r_ = simde_int8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5501,7 +5501,7 @@ test_simde_vreinterpret_s16_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_t r = simde_vreinterpret_s16_u32(a); simde_int16x4_private r_ = simde_int16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5549,7 +5549,7 @@ test_simde_vreinterpret_s32_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r = simde_vreinterpret_s32_u32(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5597,7 +5597,7 @@ test_simde_vreinterpret_s64_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_u32(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5645,7 +5645,7 @@ test_simde_vreinterpret_u8_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_u32(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5692,7 +5692,7 @@ test_simde_vreinterpret_u16_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x2_private a_ = simde_uint32x2_to_private(a); simde_uint16x4_t r = simde_vreinterpret_u16_u32(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -5739,7 +5739,7 @@ test_simde_vreinterpret_u64_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x2_private a_ = simde_uint32x2_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_u32(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + 
   }
 
   return 0;
@@ -5778,7 +5778,7 @@ test_simde_vreinterpretq_f16_u16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_t r = simde_vreinterpretq_f16_u16(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -5825,7 +5825,7 @@ test_simde_vreinterpretq_f32_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
     simde_float32x4_t r = simde_vreinterpretq_f32_u32(a);
     simde_float32x4_private r_ = simde_float32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -5873,7 +5873,7 @@ test_simde_vreinterpretq_f64_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_t r = simde_vreinterpretq_f64_u32(a);
     simde_float64x2_private r_ = simde_float64x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -5921,7 +5921,7 @@ test_simde_vreinterpretq_s8_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x16_t r = simde_vreinterpretq_s8_u32(a);
     simde_int8x16_private r_ = simde_int8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -5969,7 +5969,7 @@ test_simde_vreinterpretq_s16_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x8_t r = simde_vreinterpretq_s16_u32(a);
     simde_int16x8_private r_ = simde_int16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6016,7 +6016,7 @@ test_simde_vreinterpretq_s32_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
     simde_int32x4_t r = simde_vreinterpretq_s32_u32(a);
     simde_int32x4_private r_ = simde_int32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -6063,7 +6063,7 @@ test_simde_vreinterpretq_s64_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
     simde_int64x2_t r = simde_vreinterpretq_s64_u32(a);
     simde_int64x2_private r_ = simde_int64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -6111,7 +6111,7 @@ test_simde_vreinterpretq_u8_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint8x16_t r = simde_vreinterpretq_u8_u32(a);
     simde_uint8x16_private r_ = simde_uint8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6158,7 +6158,7 @@ test_simde_vreinterpretq_u16_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
     simde_uint16x8_t r = simde_vreinterpretq_u16_u32(a);
     simde_uint16x8_private r_ = simde_uint16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -6205,7 +6205,7 @@ test_simde_vreinterpretq_u64_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
     simde_uint64x2_t r = simde_vreinterpretq_u64_u32(a);
     simde_uint64x2_private r_ = simde_uint64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -6253,7 +6253,7 @@ test_simde_vreinterpret_f32_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_t r = simde_vreinterpret_f32_u64(a);
     simde_float32x2_private r_ = simde_float32x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6301,7 +6301,7 @@ test_simde_vreinterpret_f64_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_t r = simde_vreinterpret_f64_u64(a);
     simde_float64x1_private r_ = simde_float64x1_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6349,7 +6349,7 @@ test_simde_vreinterpret_s8_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x8_t r = simde_vreinterpret_s8_u64(a);
     simde_int8x8_private r_ = simde_int8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6397,7 +6397,7 @@ test_simde_vreinterpret_s16_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x4_t r = simde_vreinterpret_s16_u64(a);
     simde_int16x4_private r_ = simde_int16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6445,7 +6445,7 @@ test_simde_vreinterpret_s32_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int32x2_t r = simde_vreinterpret_s32_u64(a);
     simde_int32x2_private r_ = simde_int32x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6493,7 +6493,7 @@ test_simde_vreinterpret_s64_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int64x1_t r = simde_vreinterpret_s64_u64(a);
     simde_int64x1_private r_ = simde_int64x1_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6541,7 +6541,7 @@ test_simde_vreinterpret_u8_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint8x8_t r = simde_vreinterpret_u8_u64(a);
     simde_uint8x8_private r_ = simde_uint8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
  }
 
   return 0;
@@ -6588,7 +6588,7 @@ test_simde_vreinterpret_u16_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint64x1_private a_ = simde_uint64x1_to_private(a);
     simde_uint16x4_t r = simde_vreinterpret_u16_u64(a);
     simde_uint16x4_private r_ = simde_uint16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6635,7 +6635,7 @@ test_simde_vreinterpret_u32_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint64x1_private a_ = simde_uint64x1_to_private(a);
     simde_uint32x2_t r = simde_vreinterpret_u32_u64(a);
     simde_uint32x2_private r_ = simde_uint32x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6682,7 +6682,7 @@ test_simde_vreinterpretq_f32_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
     simde_float32x4_t r = simde_vreinterpretq_f32_u64(a);
     simde_float32x4_private r_ = simde_float32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6730,7 +6730,7 @@ test_simde_vreinterpretq_f64_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_t r = simde_vreinterpretq_f64_u64(a);
     simde_float64x2_private r_ = simde_float64x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6778,7 +6778,7 @@ test_simde_vreinterpretq_s8_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x16_t r = simde_vreinterpretq_s8_u64(a);
     simde_int8x16_private r_ = simde_int8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6826,7 +6826,7 @@ test_simde_vreinterpretq_s16_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x8_t r = simde_vreinterpretq_s16_u64(a);
     simde_int16x8_private r_ = simde_int16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -6873,7 +6873,7 @@ test_simde_vreinterpretq_s32_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
     simde_int32x4_t r = simde_vreinterpretq_s32_u64(a);
     simde_int32x4_private r_ = simde_int32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -6920,7 +6920,7 @@ test_simde_vreinterpretq_s64_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
     simde_int64x2_t r = simde_vreinterpretq_s64_u64(a);
    simde_int64x2_private r_ = simde_int64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -6968,7 +6968,7 @@ test_simde_vreinterpretq_u8_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint8x16_t r = simde_vreinterpretq_u8_u64(a);
     simde_uint8x16_private r_ = simde_uint8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7015,7 +7015,7 @@ test_simde_vreinterpretq_u16_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
     simde_uint16x8_t r = simde_vreinterpretq_u16_u64(a);
     simde_uint16x8_private r_ = simde_uint16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -7063,7 +7063,7 @@ test_simde_vreinterpretq_u32_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint32x4_t r = simde_vreinterpretq_u32_u64(a);
     simde_uint32x4_private r_ = simde_uint32x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7111,7 +7111,7 @@ test_simde_vreinterpret_f64_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_t r = simde_vreinterpret_f64_f32(a);
     simde_float64x1_private r_ = simde_float64x1_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7159,7 +7159,7 @@ test_simde_vreinterpret_s8_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x8_t r = simde_vreinterpret_s8_f32(a);
     simde_int8x8_private r_ = simde_int8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7207,7 +7207,7 @@ test_simde_vreinterpret_s16_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x4_t r = simde_vreinterpret_s16_f32(a);
     simde_int16x4_private r_ = simde_int16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7255,7 +7255,7 @@ test_simde_vreinterpret_s32_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int32x2_t r = simde_vreinterpret_s32_f32(a);
     simde_int32x2_private r_ = simde_int32x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7303,7 +7303,7 @@ test_simde_vreinterpret_s64_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int64x1_t r = simde_vreinterpret_s64_f32(a);
     simde_int64x1_private r_ = simde_int64x1_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7351,7 +7351,7 @@ test_simde_vreinterpret_u8_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint8x8_t r = simde_vreinterpret_u8_f32(a);
     simde_uint8x8_private r_ = simde_uint8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7398,7 +7398,7 @@ test_simde_vreinterpret_u16_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_private a_ = simde_float32x2_to_private(a);
     simde_uint16x4_t r = simde_vreinterpret_u16_f32(a);
     simde_uint16x4_private r_ = simde_uint16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7436,7 +7436,7 @@ test_simde_vreinterpret_u16_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_uint16x4_t r = simde_vreinterpret_u16_f16(a);
     simde_uint16x4_private r_ = simde_uint16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7483,7 +7483,7 @@ test_simde_vreinterpret_u32_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_private a_ = simde_float32x2_to_private(a);
     simde_uint32x2_t r = simde_vreinterpret_u32_f32(a);
     simde_uint32x2_private r_ = simde_uint32x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7530,7 +7530,7 @@ test_simde_vreinterpret_u64_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_private a_ = simde_float32x2_to_private(a);
     simde_uint64x1_t r = simde_vreinterpret_u64_f32(a);
     simde_uint64x1_private r_ = simde_uint64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7578,7 +7578,7 @@ test_simde_vreinterpretq_f64_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_t r = simde_vreinterpretq_f64_f32(a);
     simde_float64x2_private r_ = simde_float64x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7626,7 +7626,7 @@ test_simde_vreinterpretq_s8_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x16_t r = simde_vreinterpretq_s8_f32(a);
     simde_int8x16_private r_ = simde_int8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7674,7 +7674,7 @@ test_simde_vreinterpretq_s16_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x8_t r = simde_vreinterpretq_s16_f32(a);
     simde_int16x8_private r_ = simde_int16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7721,7 +7721,7 @@ test_simde_vreinterpretq_s32_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x4_private a_ = simde_float32x4_to_private(a);
     simde_int32x4_t r = simde_vreinterpretq_s32_f32(a);
     simde_int32x4_private r_ = simde_int32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -7768,7 +7768,7 @@ test_simde_vreinterpretq_s64_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x4_private a_ = simde_float32x4_to_private(a);
     simde_int64x2_t r = simde_vreinterpretq_s64_f32(a);
     simde_int64x2_private r_ = simde_int64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -7816,7 +7816,7 @@ test_simde_vreinterpretq_u8_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint8x16_t r = simde_vreinterpretq_u8_f32(a);
     simde_uint8x16_private r_ = simde_uint8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -7863,7 +7863,7 @@ test_simde_vreinterpretq_u16_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x4_private a_ = simde_float32x4_to_private(a);
     simde_uint16x8_t r = simde_vreinterpretq_u16_f32(a);
     simde_uint16x8_private r_ = simde_uint16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -7909,7 +7909,7 @@ test_simde_vreinterpretq_u16_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_uint16x8_t r = simde_vreinterpretq_u16_f16(a);
     simde_uint16x8_private r_ = simde_uint16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -7957,7 +7957,7 @@ test_simde_vreinterpretq_u32_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint32x4_t r = simde_vreinterpretq_u32_f32(a);
     simde_uint32x4_private r_ = simde_uint32x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8004,7 +8004,7 @@ test_simde_vreinterpretq_u64_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x4_private a_ = simde_float32x4_to_private(a);
     simde_uint64x2_t r = simde_vreinterpretq_u64_f32(a);
     simde_uint64x2_private r_ = simde_uint64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -8052,7 +8052,7 @@ test_simde_vreinterpret_f32_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_t r = simde_vreinterpret_f32_f64(a);
     simde_float32x2_private r_ = simde_float32x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8100,7 +8100,7 @@ test_simde_vreinterpret_s8_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x8_t r = simde_vreinterpret_s8_f64(a);
     simde_int8x8_private r_ = simde_int8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8148,7 +8148,7 @@ test_simde_vreinterpret_s16_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x4_t r = simde_vreinterpret_s16_f64(a);
     simde_int16x4_private r_ = simde_int16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8196,7 +8196,7 @@ test_simde_vreinterpret_s32_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int32x2_t r = simde_vreinterpret_s32_f64(a);
     simde_int32x2_private r_ = simde_int32x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8244,7 +8244,7 @@ test_simde_vreinterpret_s64_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int64x1_t r = simde_vreinterpret_s64_f64(a);
     simde_int64x1_private r_ = simde_int64x1_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8292,7 +8292,7 @@ test_simde_vreinterpret_u8_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint8x8_t r = simde_vreinterpret_u8_f64(a);
     simde_uint8x8_private r_ = simde_uint8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8339,7 +8339,7 @@ test_simde_vreinterpret_u16_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_private a_ = simde_float64x1_to_private(a);
     simde_uint16x4_t r = simde_vreinterpret_u16_f64(a);
     simde_uint16x4_private r_ = simde_uint16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8386,7 +8386,7 @@ test_simde_vreinterpret_u32_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_private a_ = simde_float64x1_to_private(a);
     simde_uint32x2_t r = simde_vreinterpret_u32_f64(a);
     simde_uint32x2_private r_ = simde_uint32x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8433,7 +8433,7 @@ test_simde_vreinterpret_u64_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_private a_ = simde_float64x1_to_private(a);
     simde_uint64x1_t r = simde_vreinterpret_u64_f64(a);
     simde_uint64x1_private r_ = simde_uint64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8480,7 +8480,7 @@ test_simde_vreinterpretq_f32_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_private a_ = simde_float64x2_to_private(a);
     simde_float32x4_t r = simde_vreinterpretq_f32_f64(a);
     simde_float32x4_private r_ = simde_float32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8528,7 +8528,7 @@ test_simde_vreinterpretq_s8_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x16_t r = simde_vreinterpretq_s8_f64(a);
     simde_int8x16_private r_ = simde_int8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8576,7 +8576,7 @@ test_simde_vreinterpretq_s16_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x8_t r = simde_vreinterpretq_s16_f64(a);
     simde_int16x8_private r_ = simde_int16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8623,7 +8623,7 @@ test_simde_vreinterpretq_s32_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_private a_ = simde_float64x2_to_private(a);
     simde_int32x4_t r = simde_vreinterpretq_s32_f64(a);
     simde_int32x4_private r_ = simde_int32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -8670,7 +8670,7 @@ test_simde_vreinterpretq_s64_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_private a_ = simde_float64x2_to_private(a);
     simde_int64x2_t r = simde_vreinterpretq_s64_f64(a);
     simde_int64x2_private r_ = simde_int64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -8718,7 +8718,7 @@ test_simde_vreinterpretq_u8_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint8x16_t r = simde_vreinterpretq_u8_f64(a);
     simde_uint8x16_private r_ = simde_uint8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8765,7 +8765,7 @@ test_simde_vreinterpretq_u16_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_private a_ = simde_float64x2_to_private(a);
     simde_uint16x8_t r = simde_vreinterpretq_u16_f64(a);
     simde_uint16x8_private r_ = simde_uint16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -8813,7 +8813,7 @@ test_simde_vreinterpretq_u32_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint32x4_t r = simde_vreinterpretq_u32_f64(a);
     simde_uint32x4_private r_ = simde_uint32x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8860,7 +8860,7 @@ test_simde_vreinterpretq_u64_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_private a_ = simde_float64x2_to_private(a);
     simde_uint64x2_t r = simde_vreinterpretq_u64_f64(a);
     simde_uint64x2_private r_ = simde_uint64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -8908,7 +8908,7 @@ test_simde_vreinterpret_f16_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_f32(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -8956,7 +8956,7 @@ test_simde_vreinterpret_f16_s16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_s16(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9004,7 +9004,7 @@ test_simde_vreinterpret_f16_s32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_s32(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9052,7 +9052,7 @@ test_simde_vreinterpret_f16_s64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_s64(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9100,7 +9100,7 @@ test_simde_vreinterpret_f16_s8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_s8(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9148,7 +9148,7 @@ test_simde_vreinterpret_f16_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_u32(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
  }
 
   return 0;
@@ -9196,7 +9196,7 @@ test_simde_vreinterpret_f16_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_u64(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9244,7 +9244,7 @@ test_simde_vreinterpret_f16_u8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_u8(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9292,7 +9292,7 @@ test_simde_vreinterpretq_f16_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_t r = simde_vreinterpretq_f16_f32(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9339,7 +9339,7 @@ test_simde_vreinterpretq_f16_s16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x8_private a_ = simde_int16x8_to_private(a);
     simde_float16x8_t r = simde_vreinterpretq_f16_s16(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9387,7 +9387,7 @@ test_simde_vreinterpretq_f16_s32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_t r = simde_vreinterpretq_f16_s32(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9434,7 +9434,7 @@ test_simde_vreinterpretq_f16_s64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int64x2_private a_ = simde_int64x2_to_private(a);
     simde_float16x8_t r = simde_vreinterpretq_f16_s64(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9489,7 +9489,7 @@ test_simde_vreinterpretq_f16_s8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x16_private a_ = simde_int8x16_to_private(a);
     simde_float16x8_t r = simde_vreinterpretq_f16_s8(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9537,7 +9537,7 @@ test_simde_vreinterpretq_f16_u32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_t r = simde_vreinterpretq_f16_u32(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9584,7 +9584,7 @@ test_simde_vreinterpretq_f16_u64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
     simde_float16x8_t r = simde_vreinterpretq_f16_u64(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9639,7 +9639,7 @@ test_simde_vreinterpretq_f16_u8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_uint8x16_private a_ = simde_uint8x16_to_private(a);
     simde_float16x8_t r = simde_vreinterpretq_f16_u8(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9687,7 +9687,7 @@ test_simde_vreinterpret_f16_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_t r = simde_vreinterpret_f16_f64(a);
     simde_float16x4_private r_ = simde_float16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9734,7 +9734,7 @@ test_simde_vreinterpretq_f16_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_private a_ = simde_float64x2_to_private(a);
     simde_float16x8_t r = simde_vreinterpretq_f16_f64(a);
     simde_float16x8_private r_ = simde_float16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9773,7 +9773,7 @@ test_simde_vreinterpret_f32_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_float32x2_t r = simde_vreinterpret_f32_f16(a);
     simde_float32x2_private r_ = simde_float32x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9819,7 +9819,7 @@ test_simde_vreinterpretq_f32_f16 (SIMDE_MUNIT_TEST_ARGS) {
    simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_float32x4_t r = simde_vreinterpretq_f32_f16(a);
     simde_float32x4_private r_ = simde_float32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9858,7 +9858,7 @@ test_simde_vreinterpret_f64_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_float64x1_t r = simde_vreinterpret_f64_f16(a);
     simde_float64x1_private r_ = simde_float64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9904,7 +9904,7 @@ test_simde_vreinterpretq_f64_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_float64x2_t r = simde_vreinterpretq_f64_f16(a);
     simde_float64x2_private r_ = simde_float64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9943,7 +9943,7 @@ test_simde_vreinterpret_s8_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_int8x8_t r = simde_vreinterpret_s8_f16(a);
     simde_int8x8_private r_ = simde_int8x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -9989,7 +9989,7 @@ test_simde_vreinterpretq_s8_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_int8x16_t r = simde_vreinterpretq_s8_f16(a);
     simde_int8x16_private r_ = simde_int8x16_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -10028,7 +10028,7 @@ test_simde_vreinterpret_s16_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_int16x4_t r = simde_vreinterpret_s16_f16(a);
     simde_int16x4_private r_ = simde_int16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10074,7 +10074,7 @@ test_simde_vreinterpretq_s16_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_int16x8_t r = simde_vreinterpretq_s16_f16(a);
     simde_int16x8_private r_ = simde_int16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -10113,7 +10113,7 @@ test_simde_vreinterpret_s32_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_int32x2_t r = simde_vreinterpret_s32_f16(a);
     simde_int32x2_private r_ = simde_int32x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10159,7 +10159,7 @@ test_simde_vreinterpretq_s32_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_int32x4_t r = simde_vreinterpretq_s32_f16(a);
     simde_int32x4_private r_ = simde_int32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -10198,7 +10198,7 @@ test_simde_vreinterpret_s64_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_int64x1_t r = simde_vreinterpret_s64_f16(a);
     simde_int64x1_private r_ = simde_int64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10244,7 +10244,7 @@ test_simde_vreinterpretq_s64_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_int64x2_t r = simde_vreinterpretq_s64_f16(a);
     simde_int64x2_private r_ = simde_int64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -10283,7 +10283,7 @@ test_simde_vreinterpret_u8_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_uint8x8_t r = simde_vreinterpret_u8_f16(a);
     simde_uint8x8_private r_ = simde_uint8x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10329,7 +10329,7 @@ test_simde_vreinterpretq_u8_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_uint8x16_t r = simde_vreinterpretq_u8_f16(a);
     simde_uint8x16_private r_ = simde_uint8x16_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -10367,7 +10367,7 @@ test_simde_vreinterpret_u32_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_uint32x2_t r = simde_vreinterpret_u32_f16(a);
     simde_uint32x2_private r_ = simde_uint32x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10412,7 +10412,7 @@ test_simde_vreinterpretq_u32_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_uint32x4_t r = simde_vreinterpretq_u32_f16(a);
     simde_uint32x4_private r_ = simde_uint32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -10450,7 +10450,7 @@ test_simde_vreinterpret_u64_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_uint64x1_t r = simde_vreinterpret_u64_f16(a);
     simde_uint64x1_private r_ = simde_uint64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10495,7 +10495,7 @@ test_simde_vreinterpretq_u64_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_uint64x2_t r = simde_vreinterpretq_u64_f16(a);
     simde_uint64x2_private r_ = simde_uint64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -10542,7 +10542,7 @@ test_simde_vreinterpret_p8_s8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_t r = simde_vreinterpret_p8_s8(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10588,7 +10588,7 @@ test_simde_vreinterpret_p16_s8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x8_private a_ = simde_int8x8_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_s8(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10634,7 +10634,7 @@ test_simde_vreinterpret_p64_s8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x8_private a_ = simde_int8x8_to_private(a);
     simde_poly64x1_t r = simde_vreinterpret_p64_s8(a);
     simde_poly64x1_private r_ = simde_poly64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10681,7 +10681,7 @@ test_simde_vreinterpret_p8_s16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_t r = simde_vreinterpret_p8_s16(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10727,7 +10727,7 @@ test_simde_vreinterpret_p16_s16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x4_private a_ = simde_int16x4_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_s16(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10773,7 +10773,7 @@ test_simde_vreinterpret_p64_s16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x4_private a_ = simde_int16x4_to_private(a);
     simde_poly64x1_t r = simde_vreinterpret_p64_s16(a);
     simde_poly64x1_private r_ = simde_poly64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10820,7 +10820,7 @@ test_simde_vreinterpret_p8_s32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_t r = simde_vreinterpret_p8_s32(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10866,7 +10866,7 @@ test_simde_vreinterpret_p16_s32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int32x2_private a_ = simde_int32x2_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_s32(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10912,7 +10912,7 @@ test_simde_vreinterpret_p64_s32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int32x2_private a_ = simde_int32x2_to_private(a);
     simde_poly64x1_t r = simde_vreinterpret_p64_s32(a);
     simde_poly64x1_private r_ = simde_poly64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -10959,7 +10959,7 @@ test_simde_vreinterpret_p8_s64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_t r = simde_vreinterpret_p8_s64(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11005,7 +11005,7 @@ test_simde_vreinterpret_p16_s64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int64x1_private a_ = simde_int64x1_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_s64(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11051,7 +11051,7 @@ test_simde_vreinterpret_p16_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_private a_ = simde_poly8x8_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_p8(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11097,7 +11097,7 @@ test_simde_vreinterpret_p64_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_private a_ = simde_poly8x8_to_private(a);
     simde_poly64x1_t r = simde_vreinterpret_p64_p8(a);
     simde_poly64x1_private r_ = simde_poly64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11144,7 +11144,7 @@ test_simde_vreinterpret_p8_p16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_t r = simde_vreinterpret_p8_p16(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11190,7 +11190,7 @@ test_simde_vreinterpret_p64_p16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly16x4_private a_ = simde_poly16x4_to_private(a);
     simde_poly64x1_t r = simde_vreinterpret_p64_p16(a);
     simde_poly64x1_private r_ = simde_poly64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11237,7 +11237,7 @@ test_simde_vreinterpret_p8_p64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_t r = simde_vreinterpret_p8_p64(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11283,7 +11283,7 @@ test_simde_vreinterpret_p16_p64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly64x1_private a_ = simde_poly64x1_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_p64(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11330,7 +11330,7 @@ test_simde_vreinterpret_p8_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_t r = simde_vreinterpret_p8_f32(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11376,7 +11376,7 @@ test_simde_vreinterpret_p16_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_private a_ = simde_float32x2_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_f32(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11413,7 +11413,7 @@ test_simde_vreinterpret_p16_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_f16(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11459,7 +11459,7 @@ test_simde_vreinterpret_p64_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_private a_ = simde_float32x2_to_private(a);
     simde_poly64x1_t r = simde_vreinterpret_p64_f32(a);
     simde_poly64x1_private r_ = simde_poly64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11506,7 +11506,7 @@ test_simde_vreinterpret_p8_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x8_t r = simde_vreinterpret_p8_f64(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11552,7 +11552,7 @@ test_simde_vreinterpret_p16_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_private a_ = simde_float64x1_to_private(a);
     simde_poly16x4_t r = simde_vreinterpret_p16_f64(a);
     simde_poly16x4_private r_ = simde_poly16x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11598,7 +11598,7 @@ test_simde_vreinterpret_p64_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_private a_ = simde_float64x1_to_private(a);
     simde_poly64x1_t r = simde_vreinterpret_p64_f64(a);
     simde_poly64x1_private r_ = simde_poly64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11636,7 +11636,7 @@ test_simde_vreinterpret_p8_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_poly8x8_t r = simde_vreinterpret_p8_f16(a);
     simde_poly8x8_private r_ = simde_poly8x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11681,7 +11681,7 @@ test_simde_vreinterpretq_p8_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_poly8x16_t r = simde_vreinterpretq_p8_f16(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -11719,7 +11719,7 @@ test_simde_vreinterpret_p64_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x4_private a_ = simde_float16x4_to_private(a);
     simde_poly64x1_t r = simde_vreinterpret_p64_f16(a);
     simde_poly64x1_private r_ = simde_poly64x1_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -11764,7 +11764,7 @@ test_simde_vreinterpretq_p64_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_f16(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -11819,7 +11819,7 @@ test_simde_vreinterpretq_p8_s8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_t r = simde_vreinterpretq_p8_s8(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -11873,7 +11873,7 @@ test_simde_vreinterpretq_p16_s8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x16_private a_ = simde_int8x16_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_s8(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -11927,7 +11927,7 @@ test_simde_vreinterpretq_p64_s8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x16_private a_ = simde_int8x16_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_s8(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -11974,7 +11974,7 @@ test_simde_vreinterpretq_p8_s16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_t r = simde_vreinterpretq_p8_s16(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12020,7 +12020,7 @@ test_simde_vreinterpretq_p16_s16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x8_private a_ = simde_int16x8_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_s16(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12066,7 +12066,7 @@ test_simde_vreinterpretq_p64_s16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x8_private a_ = simde_int16x8_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_s16(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12113,7 +12113,7 @@ test_simde_vreinterpretq_p8_s32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_t r = simde_vreinterpretq_p8_s32(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12159,7 +12159,7 @@ test_simde_vreinterpretq_p16_s32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int32x4_private a_ = simde_int32x4_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_s32(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12205,7 +12205,7 @@ test_simde_vreinterpretq_p64_s32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int32x4_private a_ = simde_int32x4_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_s32(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12252,7 +12252,7 @@ test_simde_vreinterpretq_p8_s64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_t r = simde_vreinterpretq_p8_s64(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12298,7 +12298,7 @@ test_simde_vreinterpretq_p16_s64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int64x2_private a_ = simde_int64x2_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_s64(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12344,7 +12344,7 @@ test_simde_vreinterpretq_p64_s64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int64x2_private a_ = simde_int64x2_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_s64(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12398,7 +12398,7 @@ test_simde_vreinterpretq_p16_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_private a_ = simde_poly8x16_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_p8(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12452,7 +12452,7 @@ test_simde_vreinterpretq_p64_p8 (SIMDE_MUNIT_TEST_ARGS) {
    simde_poly8x16_private a_ = simde_poly8x16_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_p8(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12499,7 +12499,7 @@ test_simde_vreinterpretq_p8_p16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_t r = simde_vreinterpretq_p8_p16(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12545,7 +12545,7 @@ test_simde_vreinterpretq_p64_p16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly16x8_private a_ = simde_poly16x8_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_p16(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12592,7 +12592,7 @@ test_simde_vreinterpretq_p8_p64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_t r = simde_vreinterpretq_p8_p64(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12638,7 +12638,7 @@ test_simde_vreinterpretq_p16_p64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly64x2_private a_ = simde_poly64x2_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_p64(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12685,7 +12685,7 @@ test_simde_vreinterpretq_p8_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_t r = simde_vreinterpretq_p8_f32(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12731,7 +12731,7 @@ test_simde_vreinterpretq_p16_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x4_private a_ = simde_float32x4_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_f32(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12776,7 +12776,7 @@ test_simde_vreinterpretq_p16_f16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float16x8_private a_ = simde_float16x8_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_f16(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12822,7 +12822,7 @@ test_simde_vreinterpretq_p64_f32 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x4_private a_ = simde_float32x4_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_f32(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12869,7 +12869,7 @@ test_simde_vreinterpretq_p8_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_t r = simde_vreinterpretq_p8_f64(a);
     simde_poly8x16_private r_ = simde_poly8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12915,7 +12915,7 @@ test_simde_vreinterpretq_p16_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_private a_ = simde_float64x2_to_private(a);
     simde_poly16x8_t r = simde_vreinterpretq_p16_f64(a);
     simde_poly16x8_private r_ = simde_poly16x8_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -12961,7 +12961,7 @@ test_simde_vreinterpretq_p64_f64 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_private a_ = simde_float64x2_to_private(a);
     simde_poly64x2_t r = simde_vreinterpretq_p64_f64(a);
     simde_poly64x2_private r_ = simde_poly64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -13008,7 +13008,7 @@ test_simde_vreinterpret_f32_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_t r = simde_vreinterpret_f32_p8(a);
     simde_float32x2_private r_ = simde_float32x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13055,7 +13055,7 @@ test_simde_vreinterpret_f64_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_t r = simde_vreinterpret_f64_p8(a);
     simde_float64x1_private r_ = simde_float64x1_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13102,7 +13102,7 @@ test_simde_vreinterpret_s8_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x8_t r = simde_vreinterpret_s8_p8(a);
     simde_int8x8_private r_ = simde_int8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13149,7 +13149,7 @@ test_simde_vreinterpret_s16_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x4_t r = simde_vreinterpret_s16_p8(a);
     simde_int16x4_private r_ = simde_int16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13196,7 +13196,7 @@ test_simde_vreinterpret_s32_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int32x2_t r = simde_vreinterpret_s32_p8(a);
     simde_int32x2_private r_ = simde_int32x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13243,7 +13243,7 @@ test_simde_vreinterpret_s64_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int64x1_t r = simde_vreinterpret_s64_p8(a);
     simde_int64x1_private r_ = simde_int64x1_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13297,7 +13297,7 @@ test_simde_vreinterpretq_f32_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_private a_ = simde_poly8x16_to_private(a);
     simde_float32x4_t r = simde_vreinterpretq_f32_p8(a);
     simde_float32x4_private r_ = simde_float32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -13352,7 +13352,7 @@ test_simde_vreinterpretq_f64_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x2_t r = simde_vreinterpretq_f64_p8(a);
     simde_float64x2_private r_ = simde_float64x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -13407,7 +13407,7 @@ test_simde_vreinterpretq_s8_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x16_t r = simde_vreinterpretq_s8_p8(a);
     simde_int8x16_private r_ = simde_int8x16_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -13462,7 +13462,7 @@ test_simde_vreinterpretq_s16_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x8_t r = simde_vreinterpretq_s16_p8(a);
     simde_int16x8_private r_ = simde_int16x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -13516,7 +13516,7 @@ test_simde_vreinterpretq_s32_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_private a_ = simde_poly8x16_to_private(a);
     simde_int32x4_t r = simde_vreinterpretq_s32_p8(a);
     simde_int32x4_private r_ = simde_int32x4_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -13570,7 +13570,7 @@ test_simde_vreinterpretq_s64_p8 (SIMDE_MUNIT_TEST_ARGS) {
     simde_poly8x16_private a_ = simde_poly8x16_to_private(a);
     simde_int64x2_t r = simde_vreinterpretq_s64_p8(a);
     simde_int64x2_private r_ = simde_int64x2_to_private(r);
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16));
   }
 
   return 0;
@@ -13617,7 +13617,7 @@ test_simde_vreinterpret_f32_p16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float32x2_t r = simde_vreinterpret_f32_p16(a);
     simde_float32x2_private r_ = simde_float32x2_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13664,7 +13664,7 @@ test_simde_vreinterpret_f64_p16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_float64x1_t r = simde_vreinterpret_f64_p16(a);
     simde_float64x1_private r_ = simde_float64x1_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13711,7 +13711,7 @@ test_simde_vreinterpret_s8_p16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int8x8_t r = simde_vreinterpret_s8_p16(a);
     simde_int8x8_private r_ = simde_int8x8_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
  }
 
   return 0;
@@ -13758,7 +13758,7 @@ test_simde_vreinterpret_s16_p16 (SIMDE_MUNIT_TEST_ARGS) {
     simde_int16x4_t r = simde_vreinterpret_s16_p16(a);
     simde_int16x4_private r_ = simde_int16x4_to_private(r);
 
-    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_)));
+    simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8));
   }
 
   return 0;
@@ -13805,7 +13805,7 @@ test_simde_vreinterpret_s32_p16 (SIMDE_MUNIT_TEST_ARGS) {
(SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r = simde_vreinterpret_s32_p16(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -13852,7 +13852,7 @@ test_simde_vreinterpret_s64_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_p16(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -13898,7 +13898,7 @@ test_simde_vreinterpretq_f32_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x8_private a_ = simde_poly16x8_to_private(a); simde_float32x4_t r = simde_vreinterpretq_f32_p16(a); simde_float32x4_private r_ = simde_float32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -13945,7 +13945,7 @@ test_simde_vreinterpretq_f64_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_p16(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -13992,7 +13992,7 @@ test_simde_vreinterpretq_s8_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_t r = simde_vreinterpretq_s8_p16(a); simde_int8x16_private r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14039,7 +14039,7 @@ test_simde_vreinterpretq_s16_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_t r = simde_vreinterpretq_s16_p16(a); simde_int16x8_private r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14085,7 +14085,7 @@ test_simde_vreinterpretq_s32_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x8_private a_ = simde_poly16x8_to_private(a); simde_int32x4_t r = simde_vreinterpretq_s32_p16(a); simde_int32x4_private r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14131,7 +14131,7 @@ test_simde_vreinterpretq_s64_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x8_private a_ = simde_poly16x8_to_private(a); simde_int64x2_t r = simde_vreinterpretq_s64_p16(a); simde_int64x2_private r_ = simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14169,7 +14169,7 @@ test_simde_vreinterpret_f16_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_float16x4_t r = simde_vreinterpret_f16_p16(a); simde_float16x4_private r_ = simde_float16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14207,7 +14207,7 @@ test_simde_vreinterpretq_f16_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_float16x8_t r = simde_vreinterpretq_f16_p16(a); simde_float16x8_private r_ = simde_float16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14254,7 +14254,7 @@ test_simde_vreinterpret_f64_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_p64(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - 
simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14301,7 +14301,7 @@ test_simde_vreinterpret_s8_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_t r = simde_vreinterpret_s8_p64(a); simde_int8x8_private r_ = simde_int8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14348,7 +14348,7 @@ test_simde_vreinterpret_s16_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_t r = simde_vreinterpret_s16_p64(a); simde_int16x4_private r_ = simde_int16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14395,7 +14395,7 @@ test_simde_vreinterpret_s32_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r = simde_vreinterpret_s32_p64(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14442,7 +14442,7 @@ test_simde_vreinterpret_s64_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_p64(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14489,7 +14489,7 @@ test_simde_vreinterpretq_f64_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_p64(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14536,7 +14536,7 @@ test_simde_vreinterpretq_s8_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_t r = simde_vreinterpretq_s8_p64(a); simde_int8x16_private r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14583,7 +14583,7 @@ test_simde_vreinterpretq_s16_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_t r = simde_vreinterpretq_s16_p64(a); simde_int16x8_private r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14629,7 +14629,7 @@ test_simde_vreinterpretq_s32_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x2_private a_ = simde_poly64x2_to_private(a); simde_int32x4_t r = simde_vreinterpretq_s32_p64(a); simde_int32x4_private r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14675,7 +14675,7 @@ test_simde_vreinterpretq_s64_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x2_private a_ = simde_poly64x2_to_private(a); simde_int64x2_t r = simde_vreinterpretq_s64_p64(a); simde_int64x2_private r_ = simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14722,7 +14722,7 @@ test_simde_vreinterpret_f16_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_float16x4_t r = simde_vreinterpret_f16_p64(a); simde_float16x4_private r_ = simde_float16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14769,7 +14769,7 @@ test_simde_vreinterpret_f16_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_float16x4_t r = 
simde_vreinterpret_f16_p8(a); simde_float16x4_private r_ = simde_float16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14815,7 +14815,7 @@ test_simde_vreinterpretq_f16_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x2_private a_ = simde_poly64x2_to_private(a); simde_float16x8_t r = simde_vreinterpretq_f16_p64(a); simde_float16x8_private r_ = simde_float16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14869,7 +14869,7 @@ test_simde_vreinterpretq_f16_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x16_private a_ = simde_poly8x16_to_private(a); simde_float16x8_t r = simde_vreinterpretq_f16_p8(a); simde_float16x8_private r_ = simde_float16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -14915,7 +14915,7 @@ test_simde_vreinterpret_u16_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x8_private a_ = simde_poly8x8_to_private(a); simde_uint16x4_t r = simde_vreinterpret_u16_p8(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -14961,7 +14961,7 @@ test_simde_vreinterpret_u32_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x8_private a_ = simde_poly8x8_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_p8(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15007,7 +15007,7 @@ test_simde_vreinterpret_u64_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x8_private a_ = simde_poly8x8_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_p8(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15061,7 +15061,7 @@ test_simde_vreinterpretq_u16_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x16_private a_ = simde_poly8x16_to_private(a); simde_uint16x8_t r = simde_vreinterpretq_u16_p8(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15116,7 +15116,7 @@ test_simde_vreinterpretq_u32_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_p8(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15170,7 +15170,7 @@ test_simde_vreinterpretq_u64_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x16_private a_ = simde_poly8x16_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_p8(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15217,7 +15217,7 @@ test_simde_vreinterpret_u8_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_p16(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15263,7 +15263,7 @@ 
test_simde_vreinterpret_u32_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x4_private a_ = simde_poly16x4_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_p16(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15309,7 +15309,7 @@ test_simde_vreinterpret_u64_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x4_private a_ = simde_poly16x4_to_private(a); simde_uint64x1_t r = simde_vreinterpret_u64_p16(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15356,7 +15356,7 @@ test_simde_vreinterpretq_u8_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_p16(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15403,7 +15403,7 @@ test_simde_vreinterpretq_u32_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_p16(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15449,7 +15449,7 @@ test_simde_vreinterpretq_u64_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x8_private a_ = simde_poly16x8_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_p16(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15496,7 +15496,7 @@ test_simde_vreinterpret_u8_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_p64(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15542,7 +15542,7 @@ test_simde_vreinterpret_u16_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x1_private a_ = simde_poly64x1_to_private(a); simde_uint16x4_t r = simde_vreinterpret_u16_p64(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15588,7 +15588,7 @@ test_simde_vreinterpret_u32_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x1_private a_ = simde_poly64x1_to_private(a); simde_uint32x2_t r = simde_vreinterpret_u32_p64(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15635,7 +15635,7 @@ test_simde_vreinterpretq_u8_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_p64(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15681,7 +15681,7 @@ test_simde_vreinterpretq_u16_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x2_private a_ = simde_poly64x2_to_private(a); simde_uint16x8_t r = simde_vreinterpretq_u16_p64(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15728,7 
+15728,7 @@ test_simde_vreinterpretq_u32_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_p64(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15774,7 +15774,7 @@ test_simde_vreinterpret_p16_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_private a_ = simde_uint8x8_to_private(a); simde_poly16x4_t r = simde_vreinterpret_p16_u8(a); simde_poly16x4_private r_ = simde_poly16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15828,7 +15828,7 @@ test_simde_vreinterpretq_p16_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a); simde_poly16x8_t r = simde_vreinterpretq_p16_u8(a); simde_poly16x8_private r_ = simde_poly16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15874,7 +15874,7 @@ test_simde_vreinterpret_p16_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x2_private a_ = simde_uint32x2_to_private(a); simde_poly16x4_t r = simde_vreinterpret_p16_u32(a); simde_poly16x4_private r_ = simde_poly16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -15921,7 +15921,7 @@ test_simde_vreinterpretq_p16_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_poly16x8_t r = simde_vreinterpretq_p16_u32(a); simde_poly16x8_private r_ = simde_poly16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -15967,7 +15967,7 @@ test_simde_vreinterpret_p16_u64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint64x1_private a_ = simde_uint64x1_to_private(a); simde_poly16x4_t r = simde_vreinterpret_p16_u64(a); simde_poly16x4_private r_ = simde_poly16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16013,7 +16013,7 @@ test_simde_vreinterpretq_p16_u64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_poly16x8_t r = simde_vreinterpretq_p16_u64(a); simde_poly16x8_private r_ = simde_poly16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16059,7 +16059,7 @@ test_simde_vreinterpret_p64_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_private a_ = simde_uint8x8_to_private(a); simde_poly64x1_t r = simde_vreinterpret_p64_u8(a); simde_poly64x1_private r_ = simde_poly64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16113,7 +16113,7 @@ test_simde_vreinterpretq_p64_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_private a_ = simde_uint8x16_to_private(a); simde_poly64x2_t r = simde_vreinterpretq_p64_u8(a); simde_poly64x2_private r_ = simde_poly64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16159,7 +16159,7 @@ test_simde_vreinterpret_p64_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x4_private a_ = simde_uint16x4_to_private(a); simde_poly64x1_t r = simde_vreinterpret_p64_u16(a); 
simde_poly64x1_private r_ = simde_poly64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16205,7 +16205,7 @@ test_simde_vreinterpretq_p64_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_poly64x2_t r = simde_vreinterpretq_p64_u16(a); simde_poly64x2_private r_ = simde_poly64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16251,7 +16251,7 @@ test_simde_vreinterpret_p64_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x2_private a_ = simde_uint32x2_to_private(a); simde_poly64x1_t r = simde_vreinterpret_p64_u32(a); simde_poly64x1_private r_ = simde_poly64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16297,7 +16297,7 @@ test_simde_vreinterpretq_p64_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_private a_ = simde_uint32x4_to_private(a); simde_poly64x2_t r = simde_vreinterpretq_p64_u32(a); simde_poly64x2_private r_ = simde_poly64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16344,7 +16344,7 @@ test_simde_vreinterpret_p8_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x8_t r = simde_vreinterpret_p8_u16(a); simde_poly8x8_private r_ = simde_poly8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16391,7 +16391,7 @@ test_simde_vreinterpretq_p8_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x16_t r = simde_vreinterpretq_p8_u16(a); simde_poly8x16_private r_ = simde_poly8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16438,7 +16438,7 @@ test_simde_vreinterpret_p8_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x8_t r = simde_vreinterpret_p8_u32(a); simde_poly8x8_private r_ = simde_poly8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16485,7 +16485,7 @@ test_simde_vreinterpretq_p8_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x16_t r = simde_vreinterpretq_p8_u32(a); simde_poly8x16_private r_ = simde_poly8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16532,7 +16532,7 @@ test_simde_vreinterpret_p8_u64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x8_t r = simde_vreinterpret_p8_u64(a); simde_poly8x8_private r_ = simde_poly8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16579,7 +16579,7 @@ test_simde_vreinterpretq_p8_u64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x16_t r = simde_vreinterpretq_p8_u64(a); simde_poly8x16_private r_ = simde_poly8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16626,7 +16626,7 @@ test_simde_vreinterpret_p8_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x8_t r = simde_vreinterpret_p8_u8(a); simde_poly8x8_private r_ = simde_poly8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 
0; @@ -16681,7 +16681,7 @@ test_simde_vreinterpretq_p8_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x16_t r = simde_vreinterpretq_p8_u8(a); simde_poly8x16_private r_ = simde_poly8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16728,7 +16728,7 @@ test_simde_vreinterpret_p16_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x4_t r = simde_vreinterpret_p16_u16(a); simde_poly16x4_private r_ = simde_poly16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16775,7 +16775,7 @@ test_simde_vreinterpretq_p16_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x8_t r = simde_vreinterpretq_p16_u16(a); simde_poly16x8_private r_ = simde_poly16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16822,7 +16822,7 @@ test_simde_vreinterpret_p64_u64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x1_t r = simde_vreinterpret_p64_u64(a); simde_poly64x1_private r_ = simde_poly64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16868,7 +16868,7 @@ test_simde_vreinterpretq_p64_u64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint64x2_private a_ = simde_uint64x2_to_private(a); simde_poly64x2_t r = simde_vreinterpretq_p64_u64(a); simde_poly64x2_private r_ = simde_poly64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -16915,7 +16915,7 @@ test_simde_vreinterpret_u8_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_p8(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -16970,7 +16970,7 @@ test_simde_vreinterpretq_u8_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_p8(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -17017,7 +17017,7 @@ test_simde_vreinterpret_u16_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x4_t r = simde_vreinterpret_u16_p16(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -17064,7 +17064,7 @@ test_simde_vreinterpretq_u16_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x8_t r = simde_vreinterpretq_u16_p16(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -17111,7 +17111,7 @@ test_simde_vreinterpret_u64_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_uint64x1_t r = simde_vreinterpret_u64_p64(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -17157,7 +17157,7 @@ test_simde_vreinterpretq_u64_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x2_private a_ = simde_poly64x2_to_private(a); simde_uint64x2_t r = simde_vreinterpretq_u64_p64(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, 
simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -17812,7 +17812,7 @@ test_simde_vreinterpretq_p8_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_p8_p128(a); r_ = simde_poly8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -17861,7 +17861,7 @@ test_simde_vreinterpretq_p16_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_p16_p128(a); r_ = simde_poly16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -17910,7 +17910,7 @@ test_simde_vreinterpretq_s8_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_s8_p128(a); r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -17959,7 +17959,7 @@ test_simde_vreinterpretq_s16_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_s16_p128(a); r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18008,7 +18008,7 @@ test_simde_vreinterpretq_s32_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_s32_p128(a); r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18057,7 +18057,7 @@ test_simde_vreinterpretq_s64_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_s64_p128(a); r_ = simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18106,7 +18106,7 @@ test_simde_vreinterpretq_u8_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_u8_p128(a); r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18155,7 +18155,7 @@ test_simde_vreinterpretq_u16_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_u16_p128(a); r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18204,7 +18204,7 @@ test_simde_vreinterpretq_u32_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_u32_p128(a); r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18253,7 +18253,7 @@ test_simde_vreinterpretq_u64_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_u64_p128(a); r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18302,7 +18302,7 @@ test_simde_vreinterpretq_f16_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_f16_p128(a); r_ = simde_float16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18351,7 +18351,7 @@ test_simde_vreinterpretq_f64_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_f64_p128(a); r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18387,7 +18387,7 @@ 
test_simde_vreinterpretq_p128_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); simde_poly128_t r = simde_vreinterpretq_p128_bf16(a); - simde_assert_equal_i(0, simde_memcmp(&r, &a_, sizeof(r))); + simde_assert_equal_i(0, simde_memcmp(&r, &a_, 16)); } return 0; @@ -18435,7 +18435,7 @@ test_simde_vreinterpretq_bf16_p128 (SIMDE_MUNIT_TEST_ARGS) { r = simde_vreinterpretq_bf16_p128(a); r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a, 16)); } return 0; @@ -18465,7 +18465,7 @@ test_simde_vreinterpret_bf16_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_s8(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18502,7 +18502,7 @@ test_simde_vreinterpret_bf16_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_s16(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18539,7 +18539,7 @@ test_simde_vreinterpret_bf16_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_s32(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18576,7 +18576,7 @@ test_simde_vreinterpret_bf16_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_s64(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18613,7 +18613,7 @@ test_simde_vreinterpret_bf16_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_u8(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18650,7 +18650,7 @@ test_simde_vreinterpret_bf16_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_u16(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18687,7 +18687,7 @@ test_simde_vreinterpret_bf16_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_u32(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18724,7 +18724,7 @@ test_simde_vreinterpret_bf16_u64 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_u64(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18757,7 +18757,7 @@ test_simde_vreinterpret_bf16_f32 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_f32(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ 
-18790,7 +18790,7 @@ test_simde_vreinterpret_bf16_f64 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_f64(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -18827,7 +18827,7 @@ test_simde_vreinterpretq_bf16_s8 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_s8(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -18860,7 +18860,7 @@ test_simde_vreinterpretq_bf16_s16 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_s16(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -18897,7 +18897,7 @@ test_simde_vreinterpretq_bf16_s32 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_s32(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -18930,7 +18930,7 @@ test_simde_vreinterpretq_bf16_s64 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_s64(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -18967,7 +18967,7 @@ test_simde_vreinterpretq_bf16_u8 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_u8(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19000,7 +19000,7 @@ test_simde_vreinterpretq_bf16_u16 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_u16(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19037,7 +19037,7 @@ test_simde_vreinterpretq_bf16_u32 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_u32(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19070,7 +19070,7 @@ test_simde_vreinterpretq_bf16_u64 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_u64(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19107,7 +19107,7 @@ test_simde_vreinterpretq_bf16_f32 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_f32(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19140,7 +19140,7 @@ test_simde_vreinterpretq_bf16_f64 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_f64(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, 
simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19173,7 +19173,7 @@ test_simde_vreinterpret_s8_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x8_t r = simde_vreinterpret_s8_bf16(a); simde_int8x8_private r_ = simde_int8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19206,7 +19206,7 @@ test_simde_vreinterpret_s16_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x4_t r = simde_vreinterpret_s16_bf16(a); simde_int16x4_private r_ = simde_int16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19239,7 +19239,7 @@ test_simde_vreinterpret_s32_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x2_t r = simde_vreinterpret_s32_bf16(a); simde_int32x2_private r_ = simde_int32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19272,7 +19272,7 @@ test_simde_vreinterpret_s64_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x1_t r = simde_vreinterpret_s64_bf16(a); simde_int64x1_private r_ = simde_int64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19305,7 +19305,7 @@ test_simde_vreinterpret_u8_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x8_t r = simde_vreinterpret_u8_bf16(a); simde_uint8x8_private r_ = simde_uint8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19338,7 +19338,7 @@ test_simde_vreinterpret_u16_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x4_t r = simde_vreinterpret_u16_bf16(a); simde_uint16x4_private r_ = simde_uint16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19371,7 +19371,7 @@ test_simde_vreinterpret_u32_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x2_t r = simde_vreinterpret_u32_bf16(a); simde_uint32x2_private r_ = simde_uint32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19404,7 +19404,7 @@ test_simde_vreinterpret_u64_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint64x1_t r = simde_vreinterpret_u64_bf16(a); simde_uint64x1_private r_ = simde_uint64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19437,7 +19437,7 @@ test_simde_vreinterpret_f32_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x2_t r = simde_vreinterpret_f32_bf16(a); simde_float32x2_private r_ = simde_float32x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19470,7 +19470,7 @@ test_simde_vreinterpret_f64_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x1_t r = simde_vreinterpret_f64_bf16(a); simde_float64x1_private r_ = simde_float64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19507,7 +19507,7 @@ test_simde_vreinterpretq_s8_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_int8x16_t r = simde_vreinterpretq_s8_bf16(a); simde_int8x16_private r_ = simde_int8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, 
&a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19544,7 +19544,7 @@ test_simde_vreinterpretq_s16_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_int16x8_t r = simde_vreinterpretq_s16_bf16(a); simde_int16x8_private r_ = simde_int16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19581,7 +19581,7 @@ test_simde_vreinterpretq_s32_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_int32x4_t r = simde_vreinterpretq_s32_bf16(a); simde_int32x4_private r_ = simde_int32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19618,7 +19618,7 @@ test_simde_vreinterpretq_s64_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_int64x2_t r = simde_vreinterpretq_s64_bf16(a); simde_int64x2_private r_ = simde_int64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19655,7 +19655,7 @@ test_simde_vreinterpretq_u8_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint8x16_t r = simde_vreinterpretq_u8_bf16(a); simde_uint8x16_private r_ = simde_uint8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19692,7 +19692,7 @@ test_simde_vreinterpretq_u16_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint16x8_t r = simde_vreinterpretq_u16_bf16(a); simde_uint16x8_private r_ = simde_uint16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19729,7 +19729,7 @@ test_simde_vreinterpretq_u32_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint32x4_t r = simde_vreinterpretq_u32_bf16(a); simde_uint32x4_private r_ = simde_uint32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19766,7 +19766,7 @@ test_simde_vreinterpretq_u64_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_uint64x2_t r = simde_vreinterpretq_u64_bf16(a); simde_uint64x2_private r_ = simde_uint64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19803,7 +19803,7 @@ test_simde_vreinterpretq_f32_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_float32x4_t r = simde_vreinterpretq_f32_bf16(a); simde_float32x4_private r_ = simde_float32x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19840,7 +19840,7 @@ test_simde_vreinterpretq_f64_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_float64x2_t r = simde_vreinterpretq_f64_bf16(a); simde_float64x2_private r_ = simde_float64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -19873,7 +19873,7 @@ test_simde_vreinterpret_bf16_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_p8(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19906,7 +19906,7 @@ test_simde_vreinterpret_bf16_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_p16(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - 
simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19939,7 +19939,7 @@ test_simde_vreinterpret_bf16_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x4_t r = simde_vreinterpret_bf16_p64(a); simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -19976,7 +19976,7 @@ test_simde_vreinterpretq_bf16_p8 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_p8(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -20009,7 +20009,7 @@ test_simde_vreinterpretq_bf16_p16 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_p16(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -20042,7 +20042,7 @@ test_simde_vreinterpretq_bf16_p64 (SIMDE_MUNIT_TEST_ARGS) { simde_bfloat16x8_t r = simde_vreinterpretq_bf16_p64(a); simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -20075,7 +20075,7 @@ test_simde_vreinterpret_p8_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x8_t r = simde_vreinterpret_p8_bf16(a); simde_poly8x8_private r_ = simde_poly8x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -20108,7 +20108,7 @@ test_simde_vreinterpret_p16_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x4_t r = simde_vreinterpret_p16_bf16(a); simde_poly16x4_private r_ = simde_poly16x4_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -20141,7 +20141,7 @@ test_simde_vreinterpret_p64_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x1_t r = simde_vreinterpret_p64_bf16(a); simde_poly64x1_private r_ = simde_poly64x1_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 8)); } return 0; @@ -20178,7 +20178,7 @@ test_simde_vreinterpretq_p8_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly8x16_t r = simde_vreinterpretq_p8_bf16(a); simde_poly8x16_private r_ = simde_poly8x16_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -20215,7 +20215,7 @@ test_simde_vreinterpretq_p16_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly16x8_t r = simde_vreinterpretq_p16_bf16(a); simde_poly16x8_private r_ = simde_poly16x8_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; @@ -20252,7 +20252,7 @@ test_simde_vreinterpretq_p64_bf16 (SIMDE_MUNIT_TEST_ARGS) { simde_poly64x2_t r = simde_vreinterpretq_p64_bf16(a); simde_poly64x2_private r_ = simde_poly64x2_to_private(r); - simde_assert_equal_i(0, simde_memcmp(&r_, &a_, sizeof(r_))); + simde_assert_equal_i(0, simde_memcmp(&r_, &a_, 16)); } return 0; diff --git a/test/arm/neon/test-neon.h b/test/arm/neon/test-neon.h index 72c20390f..668ecdd1a 100644 --- a/test/arm/neon/test-neon.h +++ b/test/arm/neon/test-neon.h 
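The reinterpret.c hunks above all make the same fix: the assertions previously compared sizeof(r_) bytes, but when SIMDE is built with fixed-length RVV types (e.g. -mrvv-vector-bits=256), the private structs can carry more storage than the 64-bit (8-byte) or 128-bit (16-byte) NEON payload, so memcmp also compared bytes the reinterpret never wrote. Pinning the length to the NEON register width keeps the assertion meaningful on every target. A minimal sketch of the failure mode, assuming a hypothetical 32-byte private type (illustrative only, not an actual SIMDE definition):

#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for a q-register private type whose backing
 * storage grows to 32 bytes under a 256-bit fixed-length RVV build. */
typedef struct {
  uint8_t values[32]; /* only the first 16 bytes are the NEON payload */
} example_q_private;

static int payloads_equal(const example_q_private *r_, const example_q_private *a_) {
  /* memcmp(r_, a_, sizeof(*r_)) would also compare the 16 trailing bytes,
   * which the reinterpret never writes; compare only the payload. */
  return memcmp(r_, a_, 16) == 0;
}

The test/arm/neon/test-neon.h hunks that follow apply the same reasoning to the generated comparison helpers.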
@@ -35,7 +35,7 @@ HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
     simde_vst1##modifier##_##neon_identifier(a_, a); \
     simde_vst1##modifier##_##neon_identifier(b_, b); \
 \
-    return simde_assert_equal_v##symbol_identifier##_(sizeof(a_) / sizeof(a_[0]), a_, b_, filename, line, astr, bstr); \
+    return simde_assert_equal_v##symbol_identifier##_(element_count, a_, b_, filename, line, astr, bstr); \
   }
 
 #define SIMDE_TEST_ARM_NEON_GENERATE_FLOAT_TYPE_EQUAL_FUNC_(NT, ET, SET, element_count, modifier, symbol_identifier) \
@@ -47,7 +47,7 @@ HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
     simde_vst1##modifier##_##symbol_identifier(a_, a); \
     simde_vst1##modifier##_##symbol_identifier(b_, b); \
 \
-    return simde_assert_equal_v##symbol_identifier##_(sizeof(a_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a_), HEDLEY_REINTERPRET_CAST(SET*, b_), slop, filename, line, astr, bstr); \
+    return simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a_), HEDLEY_REINTERPRET_CAST(SET*, b_), slop, filename, line, astr, bstr); \
   } \
 
 #define SIMDE_TEST_ARM_NEON_GENERATE_FLOAT_TYPE_FUNCS_(NT, ET, SET, element_count, modifier, symbol_identifier) \
@@ -171,8 +171,8 @@ HEDLEY_DIAGNOSTIC_POP
     simde_vst1##modifier##_##neon_identifier(a1_, a.val[1]); \
     simde_vst1##modifier##_##neon_identifier(b1_, b.val[1]); \
 \
-    return simde_assert_equal_v##symbol_identifier##_(sizeof(a0_) / sizeof(a0_[0]), a0_, b0_, filename, line, astr, bstr) \
-        && simde_assert_equal_v##symbol_identifier##_(sizeof(a1_) / sizeof(a1_[0]), a1_, b1_, filename, line, astr, bstr); \
+    return simde_assert_equal_v##symbol_identifier##_(element_count, a0_, b0_, filename, line, astr, bstr) \
+        && simde_assert_equal_v##symbol_identifier##_(element_count, a1_, b1_, filename, line, astr, bstr); \
   }
 
 #define SIMDE_TEST_ARM_NEON_GENERATE_X2_VECTOR_FLOAT_TYPE_EQUAL_FUNC_(NT, ET, SET, element_count, modifier, symbol_identifier) \
@@ -187,8 +187,8 @@ HEDLEY_DIAGNOSTIC_POP
     simde_vst1##modifier##_##symbol_identifier(a1_, a.val[1]); \
     simde_vst1##modifier##_##symbol_identifier(b1_, b.val[1]); \
 \
-    return simde_assert_equal_v##symbol_identifier##_(sizeof(a0_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a0_), HEDLEY_REINTERPRET_CAST(SET*, b0_), slop, filename, line, astr, bstr) && \
-           simde_assert_equal_v##symbol_identifier##_(sizeof(a1_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a1_), HEDLEY_REINTERPRET_CAST(SET*, b1_), slop, filename, line, astr, bstr); \
+    return simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a0_), HEDLEY_REINTERPRET_CAST(SET*, b0_), slop, filename, line, astr, bstr) && \
+           simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a1_), HEDLEY_REINTERPRET_CAST(SET*, b1_), slop, filename, line, astr, bstr); \
   } \
 
 #define SIMDE_TEST_ARM_NEON_GENERATE_X2_VECTOR_FLOAT_TYPE_FUNCS_(NT, ET, SET, element_count, modifier, symbol_identifier) \
@@ -322,9 +322,9 @@ HEDLEY_DIAGNOSTIC_POP
     simde_vst1##modifier##_##neon_identifier(a2_, a.val[2]); \
     simde_vst1##modifier##_##neon_identifier(b2_, b.val[2]); \
 \
-    return simde_assert_equal_v##symbol_identifier##_(sizeof(a0_) / sizeof(a0_[0]), a0_, b0_, filename, line, astr, bstr) \
-        && simde_assert_equal_v##symbol_identifier##_(sizeof(a1_) / sizeof(a1_[0]), a1_, b1_, filename, line, astr, bstr) \
-        && simde_assert_equal_v##symbol_identifier##_(sizeof(a2_) / sizeof(a2_[0]), a2_, b2_, filename, line, astr, bstr); \
+    return simde_assert_equal_v##symbol_identifier##_(element_count, a0_, b0_, filename, line, astr, bstr) \
+        && simde_assert_equal_v##symbol_identifier##_(element_count, a1_, b1_, filename, line, astr, bstr) \
+        && simde_assert_equal_v##symbol_identifier##_(element_count, a2_, b2_, filename, line, astr, bstr); \
   }
 
 #define SIMDE_TEST_ARM_NEON_GENERATE_X3_VECTOR_FLOAT_TYPE_EQUAL_FUNC_(NT, ET, SET, element_count, modifier, symbol_identifier) \
@@ -342,9 +342,9 @@ HEDLEY_DIAGNOSTIC_POP
     simde_vst1##modifier##_##symbol_identifier(a2_, a.val[2]); \
     simde_vst1##modifier##_##symbol_identifier(b2_, b.val[2]); \
 \
-    return simde_assert_equal_v##symbol_identifier##_(sizeof(a0_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a0_), HEDLEY_REINTERPRET_CAST(SET*, b0_), slop, filename, line, astr, bstr) && \
-           simde_assert_equal_v##symbol_identifier##_(sizeof(a1_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a1_), HEDLEY_REINTERPRET_CAST(SET*, b1_), slop, filename, line, astr, bstr) && \
-           simde_assert_equal_v##symbol_identifier##_(sizeof(a2_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a2_), HEDLEY_REINTERPRET_CAST(SET*, b2_), slop, filename, line, astr, bstr); \
+    return simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a0_), HEDLEY_REINTERPRET_CAST(SET*, b0_), slop, filename, line, astr, bstr) && \
+           simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a1_), HEDLEY_REINTERPRET_CAST(SET*, b1_), slop, filename, line, astr, bstr) && \
+           simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a2_), HEDLEY_REINTERPRET_CAST(SET*, b2_), slop, filename, line, astr, bstr); \
   } \
 
 #define SIMDE_TEST_ARM_NEON_GENERATE_X3_VECTOR_FLOAT_TYPE_FUNCS_(NT, ET, SET, element_count, modifier, symbol_identifier) \
@@ -490,10 +490,10 @@ HEDLEY_DIAGNOSTIC_POP
     simde_vst1##modifier##_##neon_identifier(a3_, a.val[3]); \
     simde_vst1##modifier##_##neon_identifier(b3_, b.val[3]); \
 \
-    return simde_assert_equal_v##symbol_identifier##_(sizeof(a0_) / sizeof(a0_[0]), a0_, b0_, filename, line, astr, bstr) \
-        && simde_assert_equal_v##symbol_identifier##_(sizeof(a1_) / sizeof(a1_[0]), a1_, b1_, filename, line, astr, bstr) \
-        && simde_assert_equal_v##symbol_identifier##_(sizeof(a2_) / sizeof(a2_[0]), a2_, b2_, filename, line, astr, bstr) \
-        && simde_assert_equal_v##symbol_identifier##_(sizeof(a3_) / sizeof(a3_[0]), a3_, b3_, filename, line, astr, bstr); \
+    return simde_assert_equal_v##symbol_identifier##_(element_count, a0_, b0_, filename, line, astr, bstr) \
+        && simde_assert_equal_v##symbol_identifier##_(element_count, a1_, b1_, filename, line, astr, bstr) \
+        && simde_assert_equal_v##symbol_identifier##_(element_count, a2_, b2_, filename, line, astr, bstr) \
+        && simde_assert_equal_v##symbol_identifier##_(element_count, a3_, b3_, filename, line, astr, bstr); \
   }
 
 #define SIMDE_TEST_ARM_NEON_GENERATE_X4_VECTOR_FLOAT_TYPE_EQUAL_FUNC_(NT, ET, SET, element_count, modifier, symbol_identifier) \
@@ -514,10 +514,10 @@ HEDLEY_DIAGNOSTIC_POP
     simde_vst1##modifier##_##symbol_identifier(a3_, a.val[3]); \
     simde_vst1##modifier##_##symbol_identifier(b3_, b.val[3]); \
 \
-    return simde_assert_equal_v##symbol_identifier##_(sizeof(a0_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a0_), HEDLEY_REINTERPRET_CAST(SET*, b0_), slop, filename, line, astr, bstr) && \
-           simde_assert_equal_v##symbol_identifier##_(sizeof(a1_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a1_), HEDLEY_REINTERPRET_CAST(SET*, b1_), slop, filename, line, astr, bstr) && \
-           simde_assert_equal_v##symbol_identifier##_(sizeof(a2_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a2_), HEDLEY_REINTERPRET_CAST(SET*, b2_), slop, filename, line, astr, bstr) && \
-           simde_assert_equal_v##symbol_identifier##_(sizeof(a3_) / sizeof(ET), HEDLEY_REINTERPRET_CAST(SET*, a3_), HEDLEY_REINTERPRET_CAST(SET*, b3_), slop, filename, line, astr, bstr); \
+    return simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a0_), HEDLEY_REINTERPRET_CAST(SET*, b0_), slop, filename, line, astr, bstr) && \
+           simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a1_), HEDLEY_REINTERPRET_CAST(SET*, b1_), slop, filename, line, astr, bstr) && \
+           simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a2_), HEDLEY_REINTERPRET_CAST(SET*, b2_), slop, filename, line, astr, bstr) && \
+           simde_assert_equal_v##symbol_identifier##_(element_count, HEDLEY_REINTERPRET_CAST(SET*, a3_), HEDLEY_REINTERPRET_CAST(SET*, b3_), slop, filename, line, astr, bstr); \
   } \
 
 #define SIMDE_TEST_ARM_NEON_GENERATE_X4_VECTOR_FLOAT_TYPE_FUNCS_(NT, ET, SET, element_count, modifier, symbol_identifier) \
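In test-neon.h, a0_ and friends are stack buffers whose size tracks the storage type, so sizeof(aN_) / sizeof(aN_[0]) (or sizeof(aN_) / sizeof(ET)) can over-count lanes on an RVV build where the backing type is wider than the NEON register, walking slots that vst1 never stored. The macros already receive the logical lane count as the element_count parameter, so the assertions now use it directly. A rough illustration of the difference, assuming a hypothetical 32-byte storage type standing in for a 256-bit-backed 16-lane vector (names are illustrative, not SIMDE's):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 32-byte storage type for a logical 16-lane int8 vector. */
typedef struct { int8_t storage[32]; } wide_int8x16_t;

enum { element_count = 16 }; /* logical NEON lane count */

int main(void) {
  /* Buffer sized from the storage type, as in the generated test helpers: */
  int8_t a_[sizeof(wide_int8x16_t) / sizeof(int8_t)];
  /* The sizeof-derived count is 32, but only element_count (16) lanes are
   * ever stored by vst1, so the extra slots hold indeterminate bytes. */
  printf("sizeof-derived: %zu, element_count: %d\n",
         sizeof(a_) / sizeof(a_[0]), (int) element_count);
  return 0;
}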