Skip to content

Commit

Permalink
Add support for xsimd::transpose
Browse files Browse the repository at this point in the history
Support 64 bit registers for avx, neon, sse and wasm
  • Loading branch information
serge-sans-paille committed Oct 11, 2024
1 parent 74b2dff commit 58ba881
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 2 deletions.
30 changes: 30 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1646,6 +1646,36 @@ namespace xsimd
return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
}

// transpose (64-bit floating point)
// In-place transpose of the square matrix whose rows are the four batches
// starting at matrix_begin. matrix_end is only used to check, in debug
// builds, that the range holds exactly batch<double, A>::size rows.
template <class A>
XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<avx>) noexcept
{
    assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto r0 = matrix_begin[0], r1 = matrix_begin[1],
         r2 = matrix_begin[2], r3 = matrix_begin[3];

    // unpacklo/unpackhi interleave independently inside each 128-bit lane,
    // so every temporary mixes two different columns (one per lane).
    auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r02 r12
    auto t1 = _mm256_unpackhi_pd(r0, r1); // r01 r11 r03 r13
    auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r22 r32
    auto t3 = _mm256_unpackhi_pd(r2, r3); // r21 r31 r23 r33

    // 0x20 glues the two low lanes together, 0x31 the two high lanes.
    // Columns 0 and 2 live in t0/t2, columns 1 and 3 in t1/t3.
    matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20); // r00 r10 r20 r30
    matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20); // r01 r11 r21 r31
    matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31); // r02 r12 r22 r32
    matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31); // r03 r13 r23 r33
}

// transpose (unsigned 64-bit integers)
// Reinterprets the rows as batches of double and delegates to the
// batch<double, A> overload selected for architecture A.
template <class A>
XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<avx>) noexcept
{
    using double_batch = batch<double, A>;
    auto* rows_first = reinterpret_cast<double_batch*>(matrix_begin);
    auto* rows_last = reinterpret_cast<double_batch*>(matrix_end);
    transpose(rows_first, rows_last, A {});
}
// transpose (signed 64-bit integers)
// Reinterprets the rows as batches of double and delegates to the
// batch<double, A> overload selected for architecture A.
template <class A>
XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<avx>) noexcept
{
    using double_batch = batch<double, A>;
    auto* rows_first = reinterpret_cast<double_batch*>(matrix_begin);
    auto* rows_last = reinterpret_cast<double_batch*>(matrix_end);
    transpose(rows_first, rows_last, A {});
}

// trunc
template <class A>
XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
Expand Down
20 changes: 20 additions & 0 deletions include/xsimd/arch/xsimd_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1791,6 +1791,26 @@ namespace xsimd
matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
}

// transpose (unsigned 64-bit lanes)
// In-place transpose of the two rows at matrix_begin: the new row 0 is built
// from the low halves of both rows, the new row 1 from their high halves.
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto top = matrix_begin[0];
    auto bottom = matrix_begin[1];
    auto low_halves = vcombine_u64(vget_low_u64(top), vget_low_u64(bottom));
    auto high_halves = vcombine_u64(vget_high_u64(top), vget_high_u64(bottom));
    matrix_begin[0] = low_halves;
    matrix_begin[1] = high_halves;
}

// transpose (signed 64-bit lanes)
// In-place transpose of the two rows at matrix_begin: the new row 0 is built
// from the low halves of both rows, the new row 1 from their high halves.
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto top = matrix_begin[0];
    auto bottom = matrix_begin[1];
    auto low_halves = vcombine_s64(vget_low_s64(top), vget_low_s64(bottom));
    auto high_halves = vcombine_s64(vget_high_s64(top), vget_high_s64(bottom));
    matrix_begin[0] = low_halves;
    matrix_begin[1] = high_halves;
}

/**********
* zip_lo *
**********/
Expand Down
30 changes: 30 additions & 0 deletions include/xsimd/arch/xsimd_neon64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -950,6 +950,36 @@ namespace xsimd
return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
}

// transpose (64-bit floating point)
// In-place transpose of the two rows at matrix_begin using the zip
// intrinsics: zip1 gathers the first lane of each row, zip2 the second lane.
template <class A>
XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<neon64>) noexcept
{
    assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto top = matrix_begin[0];
    auto bottom = matrix_begin[1];
    auto first_lanes = vzip1q_f64(top, bottom);
    auto second_lanes = vzip2q_f64(top, bottom);
    matrix_begin[0] = first_lanes;
    matrix_begin[1] = second_lanes;
}

// transpose (unsigned 64-bit lanes)
// In-place transpose of the two rows at matrix_begin using the zip
// intrinsics: zip1 gathers the first lane of each row, zip2 the second lane.
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto top = matrix_begin[0];
    auto bottom = matrix_begin[1];
    auto first_lanes = vzip1q_u64(top, bottom);
    auto second_lanes = vzip2q_u64(top, bottom);
    matrix_begin[0] = first_lanes;
    matrix_begin[1] = second_lanes;
}

// transpose (signed 64-bit lanes)
// In-place transpose of the two rows at matrix_begin using the zip
// intrinsics: zip1 gathers the first lane of each row, zip2 the second lane.
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
{
    assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto top = matrix_begin[0];
    auto bottom = matrix_begin[1];
    auto first_lanes = vzip1q_s64(top, bottom);
    auto second_lanes = vzip2q_s64(top, bottom);
    matrix_begin[0] = first_lanes;
    matrix_begin[1] = second_lanes;
}

/**********
* zip_lo *
**********/
Expand Down
24 changes: 22 additions & 2 deletions include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1656,12 +1656,32 @@ namespace xsimd
// transpose (unsigned 32-bit integers)
// Reinterprets the rows as batches of float and delegates to the
// batch<float, A> overload selected for architecture A.
template <class A>
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
    // Forward exactly once: transposing twice would restore the input.
    transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
}
// transpose (signed 32-bit integers)
// Reinterprets the rows as batches of float and delegates to the
// batch<float, A> overload selected for architecture A.
template <class A>
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
    // Forward exactly once: transposing twice would restore the input.
    transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
}

// transpose (64-bit floating point)
// In-place transpose of the two rows at matrix_begin: unpacklo pairs the low
// element of each row, unpackhi pairs the high element of each row.
template <class A>
XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<sse2>) noexcept
{
    assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
    (void)matrix_end;
    auto top = matrix_begin[0];
    auto bottom = matrix_begin[1];
    auto low_pair = _mm_unpacklo_pd(top, bottom);
    auto high_pair = _mm_unpackhi_pd(top, bottom);
    matrix_begin[0] = low_pair;
    matrix_begin[1] = high_pair;
}
// transpose (unsigned 64-bit integers)
// Reinterprets the rows as batches of double and delegates to the
// batch<double, A> overload selected for architecture A.
template <class A>
XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
    using double_batch = batch<double, A>;
    auto* rows_first = reinterpret_cast<double_batch*>(matrix_begin);
    auto* rows_last = reinterpret_cast<double_batch*>(matrix_end);
    transpose(rows_first, rows_last, A {});
}
// transpose (signed 64-bit integers)
// Reinterprets the rows as batches of double and delegates to the
// batch<double, A> overload selected for architecture A.
template <class A>
XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
    using double_batch = batch<double, A>;
    auto* rows_first = reinterpret_cast<double_batch*>(matrix_begin);
    auto* rows_last = reinterpret_cast<double_batch*>(matrix_end);
    transpose(rows_first, rows_last, A {});
}

// zip_hi
Expand Down
7 changes: 7 additions & 0 deletions include/xsimd/arch/xsimd_wasm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1599,6 +1599,13 @@ namespace xsimd
matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
auto r0 = matrix_begin[0], r1 = matrix_begin[1];

matrix_begin[0] = wasm_i64x2_shuffle(r0, r1, 0, 2);
matrix_begin[1] = wasm_i64x2_shuffle(r0, r1, 1, 3);
}
else
{
transpose(matrix_begin, matrix_end, generic {});
Expand Down

0 comments on commit 58ba881

Please sign in to comment.