From 98bf2df2ca0aabfb1aa5cc74b5b5e4aea69ddea6 Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Sun, 6 Oct 2024 17:45:13 +0200
Subject: [PATCH] [u]int32 support

---
 include/xsimd/arch/xsimd_avx.hpp  | 30 ++++++++++++++++++++++++++++++
 include/xsimd/arch/xsimd_neon.hpp | 26 ++++++++++++++++++++++++++
 include/xsimd/arch/xsimd_wasm.hpp | 33 +++++++++++++++++++++----------
 3 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 32f609271..998799fdf 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1635,6 +1635,36 @@ namespace xsimd
             matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
         }
 
+        namespace detail
+        {
+            template <class Tp, class A>
+            XSIMD_INLINE std::array<batch<float, A>, batch<Tp, A>::size> make_converted_sequence(batch<Tp, A>* matrix_begin, batch<Tp, A>* matrix_end)
+            {
+                (void)matrix_end;
+                std::array<batch<float, A>, batch<Tp, A>::size> converted;
+                for (size_t i = 0; i < batch<Tp, A>::size; ++i)
+                    converted[i] = bitwise_cast<float>(matrix_begin[i]);
+                return converted;
+            }
+        }
+
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            auto converted = detail::make_converted_sequence(matrix_begin, matrix_end);
+            transpose(&converted[0], &converted[0] + batch<uint32_t, A>::size, A {});
+            for (size_t i = 0; i < batch<uint32_t, A>::size; ++i)
+                matrix_begin[i] = bitwise_cast<uint32_t>(converted[i]);
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            auto converted = detail::make_converted_sequence(matrix_begin, matrix_end);
+            transpose(&converted[0], &converted[0] + batch<int32_t, A>::size, A {});
+            for (size_t i = 0; i < batch<int32_t, A>::size; ++i)
+                matrix_begin[i] = bitwise_cast<int32_t>(converted[i]);
+        }
+
         // trunc
         template <class A>
         XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
index f6c791dca..9d67d7890 100644
--- a/include/xsimd/arch/xsimd_neon.hpp
+++ b/include/xsimd/arch/xsimd_neon.hpp
@@ -1764,6 +1764,26 @@ namespace xsimd
             matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
             matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
         }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<neon>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
+            auto t01 = vtrnq_u32(r0, r1);
+            auto t23 = vtrnq_u32(r2, r3);
+            matrix_begin[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0]));
+            matrix_begin[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1]));
+            matrix_begin[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0]));
+            matrix_begin[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1]));
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<neon>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
+            auto t01 = vtrnq_s32(r0, r1);
+            auto t23 = vtrnq_s32(r2, r3);
+            matrix_begin[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
+            matrix_begin[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
+            matrix_begin[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
+            matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
+        }
 
         /**********
          * zip_lo *
diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
index ff4f80357..301450853 100644
--- a/include/xsimd/arch/xsimd_wasm.hpp
+++ b/include/xsimd/arch/xsimd_wasm.hpp
@@ -39,6 +39,8 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Vs...>, requires_arch<wasm>) noexcept;
         template <class A, class T>
         XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<wasm>) noexcept;
+        template <class A, class T>
+        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<wasm>) noexcept;
 
         // abs
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
@@ -1577,23 +1579,30 @@ namespace xsimd
         }
 
         // transpose
-        template <class A>
-        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<wasm>) noexcept
+        template <class A, class T>
+        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<wasm>) noexcept
         {
-            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
+            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
             (void)matrix_end;
-            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
 
-            auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1]
-            auto t1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3]
+                auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1]
+                auto t1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3]
 
-            auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1]
-            auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3]
+                auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1]
+                auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3]
 
-            matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0]
-            matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1]
-            matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
-            matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
+                matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0]
+                matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1]
+                matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
+                matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
+            }
+            else
+            {
+                transpose(matrix_begin, matrix_end, generic {});
+            }
         }
 
         // trunc
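For reference, a minimal usage sketch of what this patch enables; it is not part of the patch. It assumes the public xsimd::transpose(batch<T, A>* first, batch<T, A>* last) front end that dispatches to the per-architecture kernels above, and a target where xsimd provides a batch<uint32_t> type (SSE/AVX, NEON, or WASM SIMD).

// Illustrative sketch only -- not part of the patch. Round-trips an
// n x n uint32 matrix through xsimd::transpose and checks the result.
#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "xsimd/xsimd.hpp"

int main()
{
    using batch = xsimd::batch<uint32_t>;
    constexpr std::size_t n = batch::size;

    // Element (i, j) encodes its own coordinates so the transpose is easy to check.
    std::vector<uint32_t> data(n * n);
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = 0; j < n; ++j)
            data[i * n + j] = static_cast<uint32_t>(i * 100 + j);

    // One batch per row; transpose expects exactly batch::size rows.
    std::array<batch, n> rows;
    for (std::size_t i = 0; i < n; ++i)
        rows[i] = batch::load_unaligned(&data[i * n]);

    xsimd::transpose(rows.data(), rows.data() + rows.size());

    for (std::size_t i = 0; i < n; ++i)
        rows[i].store_unaligned(&data[i * n]);

    // After the transpose, element (i, j) holds the original (j, i).
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = 0; j < n; ++j)
            if (data[i * n + j] != static_cast<uint32_t>(j * 100 + i))
            {
                std::printf("mismatch at (%zu, %zu)\n", i, j);
                return 1;
            }
    std::printf("%zu x %zu uint32 transpose OK\n", n, n);
    return 0;
}

With the patch applied, this call should exercise the bitwise_cast-to-float path on AVX, the vtrnq_u32 kernel on NEON, and the sizeof(T) == 4 branch on WASM; other element widths fall back to the generic implementation.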