diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index c2380b80f..988be34af 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1646,6 +1646,36 @@ namespace xsimd
             return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
         }
 
+        template <class A>
+        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1],
+                 r2 = matrix_begin[2], r3 = matrix_begin[3];
+
+            auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r02 r12 (per 128-bit lane)
+            auto t1 = _mm256_unpackhi_pd(r0, r1); // r01 r11 r03 r13
+            auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r22 r32
+            auto t3 = _mm256_unpackhi_pd(r2, r3); // r21 r31 r23 r33
+
+            matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20); // r00 r10 r20 r30
+            matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20); // r01 r11 r21 r31
+            matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31); // r02 r12 r22 r32
+            matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31); // r03 r13 r23 r33
+        }
+
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
+        }
+
         // trunc
         template <class A>
         XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
index 9d67d7890..87f5d34b5 100644
--- a/include/xsimd/arch/xsimd_neon.hpp
+++ b/include/xsimd/arch/xsimd_neon.hpp
@@ -1791,6 +1791,26 @@ namespace xsimd
             matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
         }
 
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
+            matrix_begin[0] = vcombine_u64(vget_low_u64(r0), vget_low_u64(r1));
+            matrix_begin[1] = vcombine_u64(vget_high_u64(r0), vget_high_u64(r1));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
+            matrix_begin[0] = vcombine_s64(vget_low_s64(r0), vget_low_s64(r1));
+            matrix_begin[1] = vcombine_s64(vget_high_s64(r0), vget_high_s64(r1));
+        }
+
         /**********
          * zip_lo *
          **********/
diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp
index 063622a9e..be7c534cf 100644
--- a/include/xsimd/arch/xsimd_neon64.hpp
+++ b/include/xsimd/arch/xsimd_neon64.hpp
@@ -950,6 +950,36 @@ namespace xsimd
             return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
         }
 
+        template <class A>
+        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<neon64>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
+            matrix_begin[0] = vzip1q_f64(r0, r1);
+            matrix_begin[1] = vzip2q_f64(r0, r1);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
+            matrix_begin[0] = vzip1q_u64(r0, r1);
+            matrix_begin[1] = vzip2q_u64(r0, r1);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
+            matrix_begin[0] = vzip1q_s64(r0, r1);
+            matrix_begin[1] = vzip2q_s64(r0, r1);
+        }
+
         /**********
          * zip_lo *
          **********/
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index db82ff912..62ebcb5d0 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1656,12 +1656,32 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
         {
-            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A{});
+            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
         }
         template <class A>
         XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
         {
-            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A{});
+            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
+        }
+
+        template <class A>
+        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
+            matrix_begin[0] = _mm_unpacklo_pd(r0, r1);
+            matrix_begin[1] = _mm_unpackhi_pd(r0, r1);
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
         }
 
         // zip_hi
diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
index 301450853..9d2983992 100644
--- a/include/xsimd/arch/xsimd_wasm.hpp
+++ b/include/xsimd/arch/xsimd_wasm.hpp
@@ -1599,6 +1599,13 @@ namespace xsimd
                 matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
                 matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
             }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                auto r0 = matrix_begin[0], r1 = matrix_begin[1];
+
+                matrix_begin[0] = wasm_i64x2_shuffle(r0, r1, 0, 2); // r0[0] r1[0]
+                matrix_begin[1] = wasm_i64x2_shuffle(r0, r1, 1, 3); // r0[1] r1[1]
+            }
             else
             {
                 transpose(matrix_begin, matrix_end, generic {});
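The AVX 4x4 kernel is the only subtle one: `_mm256_unpacklo_pd`/`_mm256_unpackhi_pd` interleave within each 128-bit lane rather than across the whole register, so row 1 of the result must be built from t1/t3 and row 2 from t0/t2, as the patch now does. A stand-alone scalar model of that index bookkeeping, usable as a sanity check; the helpers `unpacklo`, `permute_20`, etc. are ad hoc stand-ins for the intrinsics, not xsimd or Intel APIs:

#include <array>
#include <cassert>

using v4 = std::array<double, 4>; // models one __m256d: two 128-bit lanes of 2 doubles

// _mm256_unpacklo_pd(a, b): per lane, element 0 of a, then element 0 of b.
static v4 unpacklo(v4 a, v4 b) { return { a[0], b[0], a[2], b[2] }; }
// _mm256_unpackhi_pd(a, b): per lane, element 1 of a, then element 1 of b.
static v4 unpackhi(v4 a, v4 b) { return { a[1], b[1], a[3], b[3] }; }
// _mm256_permute2f128_pd(a, b, 0x20): low lane of a, then low lane of b.
static v4 permute_20(v4 a, v4 b) { return { a[0], a[1], b[0], b[1] }; }
// _mm256_permute2f128_pd(a, b, 0x31): high lane of a, then high lane of b.
static v4 permute_31(v4 a, v4 b) { return { a[2], a[3], b[2], b[3] }; }

int main()
{
    v4 r0 { 0, 1, 2, 3 }, r1 { 10, 11, 12, 13 },
       r2 { 20, 21, 22, 23 }, r3 { 30, 31, 32, 33 };

    v4 t0 = unpacklo(r0, r1), t1 = unpackhi(r0, r1);
    v4 t2 = unpacklo(r2, r3), t3 = unpackhi(r2, r3);

    // Each output row must be one column of the input matrix.
    assert((permute_20(t0, t2) == v4 { 0, 10, 20, 30 }));
    assert((permute_20(t1, t3) == v4 { 1, 11, 21, 31 }));
    assert((permute_31(t0, t2) == v4 { 2, 12, 22, 32 }));
    assert((permute_31(t1, t3) == v4 { 3, 13, 23, 33 }));
}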
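For completeness, a minimal caller sketch, assuming the public `xsimd::transpose(batch*, batch*)` entry point that dispatches to these kernels; the lane count, and hence the matrix size, follows the architecture selected at build time:

#include <cstddef>
#include <iostream>

#include <xsimd/xsimd.hpp>

int main()
{
    using batch = xsimd::batch<double>; // e.g. 4 lanes under AVX, 2 under SSE2/NEON64
    constexpr std::size_t n = batch::size;

    // Row-major n x n matrix: element (i, j) = 10 * i + j.
    alignas(xsimd::default_arch::alignment()) double data[n][n];
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = 0; j < n; ++j)
            data[i][j] = 10.0 * i + j;

    batch rows[n];
    for (std::size_t i = 0; i < n; ++i)
        rows[i] = batch::load_aligned(data[i]);

    xsimd::transpose(rows, rows + n); // dispatches to the 64-bit kernels above

    for (std::size_t i = 0; i < n; ++i)
    {
        rows[i].store_aligned(data[i]);
        for (std::size_t j = 0; j < n; ++j)
            std::cout << data[i][j] << ' '; // row i now holds former column i
        std::cout << '\n';
    }
}

Under AVX this exercises the unpack/permute path for double and, via the reinterpret_cast overloads, the same path for uint64_t/int64_t; two-lane targets (SSE2, NEON, NEON64, WASM) hit the corresponding 2x2 kernels.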