Commit
Add support for xsimd::transpose
Currently only 32-bit types are specialized on sse, avx, neon and wasm. The
other implementations fall back to the sequential generic implementation.

Fix #107 (6 years old!)
serge-sans-paille committed Oct 9, 2024
1 parent 3747986 commit c6740bc
Showing 9 changed files with 233 additions and 0 deletions.
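
As a quick illustration (not part of the commit), here is a minimal usage sketch of the new entry point. It assumes the default architecture and a 32-bit element type, so one of the specialized kernels is selected where available; any other element type goes through the generic fallback.

#include <array>
#include <cstddef>
#include "xsimd/xsimd.hpp"

int main()
{
    using batch = xsimd::batch<float>; // batch type on the default architecture
    constexpr std::size_t n = batch::size;

    // One batch per row: row i holds { i*n + 0, ..., i*n + (n-1) }.
    std::array<float, n * n> data;
    for (std::size_t i = 0; i < n * n; ++i)
        data[i] = static_cast<float>(i);
    std::array<batch, n> rows;
    for (std::size_t i = 0; i < n; ++i)
        rows[i] = batch::load_unaligned(&data[i * n]);

    // In-place transpose: row i now holds { 0*n + i, 1*n + i, ..., (n-1)*n + i }.
    xsimd::transpose(rows.data(), rows.data() + rows.size());
    return 0;
}
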
2 changes: 2 additions & 0 deletions docs/source/api/data_transfer.rst
@@ -61,6 +61,8 @@ In place:

Between batches:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`transpose` | transpose a matrix as an array of batches |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`zip_lo` | interleave low halves of two batches |
+---------------------------------------+----------------------------------------------------+
26 changes: 26 additions & 0 deletions include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -639,6 +639,32 @@ namespace xsimd
hi.store_unaligned(buffer + real_batch::size);
}

// transpose
template <class A, class T>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept
{
assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
(void)matrix_end;
alignas(A::alignment()) T scratch_buffer[batch<T, A>::size * batch<T, A>::size];
for (size_t i = 0; i < batch<T, A>::size; ++i)
{
matrix_begin[i].store_aligned(&scratch_buffer[i * batch<T, A>::size]);
}
// FIXME: this is super naive we can probably do better.
for (size_t i = 0; i < batch<T, A>::size; ++i)
{
for (size_t j = 0; j < i; ++j)
{
std::swap(scratch_buffer[i * batch<T, A>::size + j],
scratch_buffer[j * batch<T, A>::size + i]);
}
}
for (size_t i = 0; i < batch<T, A>::size; ++i)
{
matrix_begin[i] = batch<T, A>::load_aligned(&scratch_buffer[i * batch<T, A>::size]);
}
}

}

}
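
For intuition, the heart of the generic kernel above is an ordinary in-place transpose of a row-major scratch matrix. A scalar model is sketched below (not part of the commit; scalar_transpose is a hypothetical name):

#include <cstddef>
#include <utility>

// Transpose an n x n row-major matrix in place by swapping elements across
// the diagonal; j < i ensures each off-diagonal pair is swapped exactly once.
// Hypothetical helper, mirroring the scratch-buffer loop in the kernel above.
template <class T>
void scalar_transpose(T* m, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = 0; j < i; ++j)
            std::swap(m[i * n + j], m[j * n + i]);
}
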
51 changes: 51 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1594,6 +1594,57 @@ namespace xsimd
return bitwise_cast<T>(
swizzle(bitwise_cast<double>(self), mask));
}
// transpose
template <class A>
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
{
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
(void)matrix_end;
// See
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2
auto r0 = matrix_begin[0], r1 = matrix_begin[1],
r2 = matrix_begin[2], r3 = matrix_begin[3],
r4 = matrix_begin[4], r5 = matrix_begin[5],
r6 = matrix_begin[6], r7 = matrix_begin[7];

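// Stage 1: within each 128-bit lane, interleave the 32-bit elements of
// adjacent row pairs (r0/r1, r2/r3, r4/r5, r6/r7).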
auto t0 = _mm256_unpacklo_ps(r0, r1);
auto t1 = _mm256_unpackhi_ps(r0, r1);
auto t2 = _mm256_unpacklo_ps(r2, r3);
auto t3 = _mm256_unpackhi_ps(r2, r3);
auto t4 = _mm256_unpacklo_ps(r4, r5);
auto t5 = _mm256_unpackhi_ps(r4, r5);
auto t6 = _mm256_unpacklo_ps(r6, r7);
auto t7 = _mm256_unpackhi_ps(r6, r7);

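// Stage 2: combine the interleaved pairs so that each 128-bit lane of
// r0..r7 holds one 4-element column of a 4-row block.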
r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));

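// Stage 3: stitch matching 128-bit lanes of the two 4-row blocks together,
// assembling full 8-element columns, i.e. the transposed rows.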
matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20);
matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20);
matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20);
matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20);
matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31);
matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31);
matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31);
matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
}

template <class A>
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<avx>) noexcept
{
return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
}
template <class A>
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<avx>) noexcept
{
return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
}

// trunc
template <class A>
44 changes: 44 additions & 0 deletions include/xsimd/arch/xsimd_neon.hpp
@@ -1748,6 +1748,49 @@ namespace xsimd
return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
}

/*************
* transpose *
*************/
template <class A>
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<neon>) noexcept
{
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
auto t01 = vtrnq_f32(r0, r1);
auto t23 = vtrnq_f32(r2, r3);
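// vtrnq_f32(a, b) transposes 2x2 blocks: val[0] = {a0, b0, a2, b2},
// val[1] = {a1, b1, a3, b3}; combining the low/high halves of t01 and t23
// below assembles the four transposed rows.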
matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
}
template <class A>
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<neon>) noexcept
{
assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
auto t01 = vtrnq_u32(r0, r1);
auto t23 = vtrnq_u32(r2, r3);
matrix_begin[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0]));
matrix_begin[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1]));
matrix_begin[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0]));
matrix_begin[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1]));
}
template <class A>
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<neon>) noexcept
{
assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
auto t01 = vtrnq_s32(r0, r1);
auto t23 = vtrnq_s32(r2, r3);
matrix_begin[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
matrix_begin[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
matrix_begin[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
}

/**********
* zip_lo *
**********/
@@ -2737,6 +2780,7 @@ namespace xsimd
return set(batch<T, A>(), A(), data[idx]...);
}
}

}

#undef WRAP_BINARY_INT_EXCLUDING_64
1 change: 1 addition & 0 deletions include/xsimd/arch/xsimd_neon64.hpp
@@ -949,6 +949,7 @@ namespace xsimd
{
return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
}

/**********
* zip_lo *
**********/
40 changes: 40 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
@@ -1640,6 +1640,46 @@ namespace xsimd
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
}

// transpose
template <class A>
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
{
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
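// _MM_TRANSPOSE4_PS is the standard <xmmintrin.h> macro: it transposes the
// 4x4 float matrix held in r0..r3 in place with a handful of shuffle/unpack
// operations.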
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
matrix_begin[0] = r0;
matrix_begin[1] = r1;
matrix_begin[2] = r2;
matrix_begin[3] = r3;
}
template <class A>
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = bitwise_cast<float>(matrix_begin[0]), r1 = bitwise_cast<float>(matrix_begin[1]),
r2 = bitwise_cast<float>(matrix_begin[2]), r3 = bitwise_cast<float>(matrix_begin[3]);
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
matrix_begin[0] = bitwise_cast<uint32_t>(r0);
matrix_begin[1] = bitwise_cast<uint32_t>(r1);
matrix_begin[2] = bitwise_cast<uint32_t>(r2);
matrix_begin[3] = bitwise_cast<uint32_t>(r3);
}
template <class A>
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = bitwise_cast<float>(matrix_begin[0]), r1 = bitwise_cast<float>(matrix_begin[1]),
r2 = bitwise_cast<float>(matrix_begin[2]), r3 = bitwise_cast<float>(matrix_begin[3]);
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
matrix_begin[0] = bitwise_cast<int32_t>(r0);
matrix_begin[1] = bitwise_cast<int32_t>(r1);
matrix_begin[2] = bitwise_cast<int32_t>(r2);
matrix_begin[3] = bitwise_cast<int32_t>(r3);
}

// zip_hi
template <class A>
XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
29 changes: 29 additions & 0 deletions include/xsimd/arch/xsimd_wasm.hpp
@@ -39,6 +39,8 @@ namespace xsimd
XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
template <class A, class T>
XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
template <class A, class T>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept;

// abs
template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
@@ -1576,6 +1578,33 @@ namespace xsimd
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
}

// transpose
template <class A, class T>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<wasm>) noexcept
{
assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
(void)matrix_end;
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
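// 32-bit element types (float, int32_t, uint32_t) get a dedicated 4x4
// shuffle sequence; other element widths fall back to the generic kernel.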
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];

auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1]
auto t1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3]

auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1]
auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3]

matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0]
matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1]
matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
}
else
{
transpose(matrix_begin, matrix_end, generic {});
}
}

// trunc
template <class A>
XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
17 changes: 17 additions & 0 deletions include/xsimd/types/xsimd_api.hpp
@@ -2516,6 +2516,23 @@ namespace xsimd
return batch_cast<as_integer_t<T>>(x);
}

/**
* @ingroup batch_data_transfer
*
* Transposes in place the matrix whose rows are the batches passed as
* arguments.
* @param matrix_begin pointer to the first row of the matrix to transpose
* @param matrix_end pointer to one past the last row of the matrix to transpose
*
*/
template <class T, class A>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end) noexcept
{
assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
detail::static_check_supported_config<T, A>();
return kernel::transpose(matrix_begin, matrix_end, A {});
}

/**
* @ingroup batch_rounding
*
23 changes: 23 additions & 0 deletions test/test_shuffle.cpp
@@ -605,6 +605,25 @@ struct shuffle_test
}
}

void transpose()
{
B b_lhs = B::load_unaligned(lhs.data());
std::array<B, size> b_matrix;
for (size_t i = 0; i < size; ++i)
b_matrix[i] = b_lhs;
std::array<value_type, size * size> ref_matrix;
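// Every input row equals lhs, so row i of the transposed matrix is lhs[i]
// broadcast across the whole row.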
for (size_t i = 0; i < size; ++i)
for (size_t j = 0; j < size; ++j)
ref_matrix[i * size + j] = lhs[i];

INFO("transpose");
xsimd::transpose(b_matrix.data(), b_matrix.data() + b_matrix.size());
for (size_t i = 0; i < size; ++i)
{
CHECK_BATCH_EQ(b_matrix[i], B::load_unaligned(&ref_matrix[i * size]));
}
}

void select()
{
B b_lhs = B::load_unaligned(lhs.data());
@@ -694,6 +713,10 @@ TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch<uint32_t>, xs
{
Test.swizzle();
}
SUBCASE("transpose")
{
Test.transpose();
}
SUBCASE("zip")
{
Test.zip();
