Add support for xsimd::transpose
Currently only 32-bit types on sse, avx, neon and wasm

Fix #107 (6 years old!)
serge-sans-paille committed Oct 6, 2024
1 parent 3747986 commit 8ae87ab
Showing 9 changed files with 187 additions and 0 deletions.
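
Not part of the diff: a minimal usage sketch of the new entry point. It assumes only the public API shown in this commit (xsimd::batch, load_unaligned/store_unaligned, and the new xsimd::transpose) and works for whatever batch width the default architecture provides.

#include <array>
#include <cstddef>
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    using batch = xsimd::batch<float>;
    constexpr std::size_t n = batch::size;

    // Build an n x n row-major matrix with distinct values.
    std::array<float, n * n> data;
    for (std::size_t i = 0; i < n * n; ++i)
        data[i] = static_cast<float>(i);

    // One batch per row.
    std::array<batch, n> rows;
    for (std::size_t i = 0; i < n; ++i)
        rows[i] = batch::load_unaligned(&data[i * n]);

    // In-place transpose: rows[i] now holds column i of the original matrix.
    xsimd::transpose(rows.data(), rows.data() + rows.size());

    for (std::size_t i = 0; i < n; ++i)
    {
        rows[i].store_unaligned(&data[i * n]);
        for (std::size_t j = 0; j < n; ++j)
            std::printf("%g ", data[i * n + j]);
        std::printf("\n");
    }
}
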
2 changes: 2 additions & 0 deletions docs/source/api/data_transfer.rst
@@ -61,6 +61,8 @@ In place:

Between batches:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`transpose` | transpose a matrix as an array of batches |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`zip_lo` | interleave low halves of two batches |
+---------------------------------------+----------------------------------------------------+
26 changes: 26 additions & 0 deletions include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -639,6 +639,32 @@ namespace xsimd
hi.store_unaligned(buffer + real_batch::size);
}

// transpose
template <class A, class T>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept
{
assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
(void)matrix_end;
alignas(A::alignment()) T scratch_buffer[batch<T, A>::size * batch<T, A>::size];
for (size_t i = 0; i < batch<T, A>::size; ++i)
{
matrix_begin[i].store_aligned(&scratch_buffer[i * batch<T, A>::size]);
}
// FIXME: this is super naive; we can probably do better.
for (size_t i = 0; i < batch<T, A>::size; ++i)
{
for (size_t j = 0; j < i; ++j)
{
std::swap(scratch_buffer[i * batch<T, A>::size + j],
scratch_buffer[j * batch<T, A>::size + i]);
}
}
for (size_t i = 0; i < batch<T, A>::size; ++i)
{
matrix_begin[i] = batch<T, A>::load_aligned(&scratch_buffer[i * batch<T, A>::size]);
}
}

}

}
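
The generic fallback spills the rows into an aligned scratch buffer, performs a plain swap-across-the-diagonal transpose, and reloads the rows. The same index pattern in scalar form, as an illustrative sketch (not part of xsimd; the helper name transpose_inplace is made up):

#include <cstddef>
#include <utility>

// Plain swap-across-the-diagonal transpose of an n x n row-major matrix,
// mirroring the loop structure used on the scratch buffer above.
void transpose_inplace(float* m, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = 0; j < i; ++j)
            std::swap(m[i * n + j], m[j * n + i]);
}
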
40 changes: 40 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1594,6 +1594,46 @@ namespace xsimd
return bitwise_cast<T>(
swizzle(bitwise_cast<double>(self), mask));
}
// transpose
template <class A>
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
{
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
(void)matrix_end;
// See
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2
auto r0 = matrix_begin[0], r1 = matrix_begin[1],
r2 = matrix_begin[2], r3 = matrix_begin[3],
r4 = matrix_begin[4], r5 = matrix_begin[5],
r6 = matrix_begin[6], r7 = matrix_begin[7];

auto t0 = _mm256_unpacklo_ps(r0, r1);
auto t1 = _mm256_unpackhi_ps(r0, r1);
auto t2 = _mm256_unpacklo_ps(r2, r3);
auto t3 = _mm256_unpackhi_ps(r2, r3);
auto t4 = _mm256_unpacklo_ps(r4, r5);
auto t5 = _mm256_unpackhi_ps(r4, r5);
auto t6 = _mm256_unpacklo_ps(r6, r7);
auto t7 = _mm256_unpackhi_ps(r6, r7);

r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));

matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20);
matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20);
matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20);
matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20);
matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31);
matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31);
matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31);
matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
}

// trunc
template <class A>
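
On the final step of the AVX kernel: _mm256_permute2f128_ps picks one 128-bit lane for each half of its result, so control 0x20 pairs the low lanes of its two sources and 0x31 pairs the high lanes, which is what recombines the two 4x4 sub-transposes into 8-wide rows. A scalar sketch of the selection rule (illustrative only; the zeroing bits of the real instruction are ignored and permute2f128_model is a made-up name):

#include <array>

// Each 256-bit AVX register is modeled as two 128-bit lanes:
// index 0 = low lane, index 1 = high lane.
using reg = std::array<int, 2>;

// Each destination lane is chosen by a 2-bit field of the control
// (0/1 = lanes of a, 2/3 = lanes of b).
reg permute2f128_model(reg a, reg b, unsigned imm)
{
    auto pick = [&](unsigned sel) { return sel < 2u ? a[sel] : b[sel - 2u]; };
    return { pick(imm & 0x3u), pick((imm >> 4) & 0x3u) };
}

// permute2f128_model(a, b, 0x20) == { a_low,  b_low  }  -> output rows 0..3
// permute2f128_model(a, b, 0x31) == { a_high, b_high }  -> output rows 4..7
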
18 changes: 18 additions & 0 deletions include/xsimd/arch/xsimd_neon.hpp
@@ -1748,6 +1748,23 @@ namespace xsimd
return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
}

/*************
* transpose *
*************/
template <class A>
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<neon>) noexcept
{
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
auto t01 = vtrnq_f32(r0, r1);
auto t23 = vtrnq_f32(r2, r3);
matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
}

/**********
* zip_lo *
**********/
@@ -2737,6 +2754,7 @@ namespace xsimd
return set(batch<T, A>(), A(), data[idx]...);
}
}

}

#undef WRAP_BINARY_INT_EXCLUDING_64
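
What the NEON kernel relies on: vtrnq_f32(a, b) transposes the 2x2 sub-blocks, producing {a[0], b[0], a[2], b[2]} and {a[1], b[1], a[3], b[3]}, and the vcombine/vget_low/vget_high calls then stitch matching 64-bit halves into full transposed rows. A scalar model of the vtrn step (illustrative only; vtrn_model is a made-up name):

#include <array>

using vec4 = std::array<float, 4>;

// Scalar model of vtrnq_f32(a, b): transpose the 2x2 sub-blocks.
std::array<vec4, 2> vtrn_model(const vec4& a, const vec4& b)
{
    return { vec4 { a[0], b[0], a[2], b[2] },
             vec4 { a[1], b[1], a[3], b[3] } };
}

// Combining the low 64-bit halves of the two val[0] results gives the first
// transposed row {r0[0], r1[0], r2[0], r3[0]}; the high halves give row 2,
// and val[1] produces rows 1 and 3 the same way.
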
1 change: 1 addition & 0 deletions include/xsimd/arch/xsimd_neon64.hpp
@@ -949,6 +949,7 @@ namespace xsimd
{
return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
}

/**********
* zip_lo *
**********/
40 changes: 40 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
@@ -1640,6 +1640,46 @@ namespace xsimd
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
}

// transpose
template <class A>
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
{
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
matrix_begin[0] = r0;
matrix_begin[1] = r1;
matrix_begin[2] = r2;
matrix_begin[3] = r3;
}
template <class A>
XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = bitwise_cast<float>(matrix_begin[0]), r1 = bitwise_cast<float>(matrix_begin[1]),
r2 = bitwise_cast<float>(matrix_begin[2]), r3 = bitwise_cast<float>(matrix_begin[3]);
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
matrix_begin[0] = bitwise_cast<uint32_t>(r0);
matrix_begin[1] = bitwise_cast<uint32_t>(r1);
matrix_begin[2] = bitwise_cast<uint32_t>(r2);
matrix_begin[3] = bitwise_cast<uint32_t>(r3);
}
template <class A>
XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
{
assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = bitwise_cast<float>(matrix_begin[0]), r1 = bitwise_cast<float>(matrix_begin[1]),
r2 = bitwise_cast<float>(matrix_begin[2]), r3 = bitwise_cast<float>(matrix_begin[3]);
_MM_TRANSPOSE4_PS(r0, r1, r2, r3);
matrix_begin[0] = bitwise_cast<int32_t>(r0);
matrix_begin[1] = bitwise_cast<int32_t>(r1);
matrix_begin[2] = bitwise_cast<int32_t>(r2);
matrix_begin[3] = bitwise_cast<int32_t>(r3);
}

// zip_hi
template <class A>
XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
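
The uint32_t and int32_t overloads reuse the float kernel through bitwise_cast, which is lossless here because the transpose only moves 32-bit lanes and never does arithmetic on them. A usage sketch for the integer path (illustrative only; assumes the build targets SSE2 so xsimd::sse2 is available, and transpose_u32_example is a made-up name):

#include <array>
#include <cstdint>
#include "xsimd/xsimd.hpp"

void transpose_u32_example()
{
    using batch = xsimd::batch<uint32_t, xsimd::sse2>; // 4 x 32-bit lanes
    std::array<batch, 4> rows = {
        batch(0u), batch(1u), batch(2u), batch(3u) // row i broadcast-filled with i
    };
    // Column j of the original matrix is {0, 1, 2, 3}, so after the call
    // every row holds {0, 1, 2, 3}.
    xsimd::transpose(rows.data(), rows.data() + rows.size());
}
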
20 changes: 20 additions & 0 deletions include/xsimd/arch/xsimd_wasm.hpp
@@ -1576,6 +1576,26 @@ namespace xsimd
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
}

// transpose
template <class A>
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<wasm>) noexcept
{
assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
(void)matrix_end;
auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];

auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1]
auto t1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3]

auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1]
auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3]

matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0]
matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1]
matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
}

// trunc
template <class A>
XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
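
For readers decoding the lane indices in the WASM kernel: in wasm_i32x4_shuffle(a, b, ...), indices 0-3 select lanes of a and 4-7 select lanes of b. A scalar model of that rule (illustrative only; shuffle_i32x4_model is a made-up name):

#include <array>

using lanes = std::array<int, 4>;

// Scalar model of wasm_i32x4_shuffle: indices 0-3 select lanes of a,
// indices 4-7 select lanes of b.
lanes shuffle_i32x4_model(const lanes& a, const lanes& b,
                          int i0, int i1, int i2, int i3)
{
    auto pick = [&](int i) { return i < 4 ? a[i] : b[i - 4]; };
    return { pick(i0), pick(i1), pick(i2), pick(i3) };
}

// shuffle_i32x4_model(r0, r1, 0, 4, 1, 5) interleaves the low halves of r0
// and r1, matching the first step of the kernel above.
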
17 changes: 17 additions & 0 deletions include/xsimd/types/xsimd_api.hpp
@@ -2516,6 +2516,23 @@ namespace xsimd
return batch_cast<as_integer_t<T>>(x);
}

/**
* @ingroup batch_data_transfer
*
* Transposes in place the matrix whose lines are the batches passed as
* arguments.
* @param matrix_begin pointer to the first line of the matrix to transpose
* @param matrix_end pointer to one element after the last line of the matrix to transpose
*
*/
template <class T, class A>
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end) noexcept
{
assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
detail::static_check_supported_config<T, A>();
return kernel::transpose(matrix_begin, matrix_end, A {});
}

/**
* @ingroup batch_rounding
*
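
A thin wrapper can make the begin/end calling convention and the size-times-size precondition explicit at compile time; a sketch under the assumption that the matrix is held in a std::array of batches (transpose_matrix is a made-up name, not part of xsimd):

#include <array>
#include <cstddef>
#include "xsimd/xsimd.hpp"

// Transpose a square matrix held as one batch per row. The array size is
// checked against the batch width at compile time, mirroring the runtime
// assert in xsimd::transpose.
template <class T, class A, std::size_t N>
void transpose_matrix(std::array<xsimd::batch<T, A>, N>& rows)
{
    static_assert(N == xsimd::batch<T, A>::size, "matrix must be size x size");
    xsimd::transpose(rows.data(), rows.data() + N);
}
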
23 changes: 23 additions & 0 deletions test/test_shuffle.cpp
@@ -605,6 +605,25 @@ struct shuffle_test
}
}

void transpose()
{
B b_lhs = B::load_unaligned(lhs.data());
std::array<B, size> b_matrix;
for (size_t i = 0; i < size; ++i)
b_matrix[i] = b_lhs;
std::array<value_type, size * size> ref_matrix;
for (size_t i = 0; i < size; ++i)
for (size_t j = 0; j < size; ++j)
ref_matrix[i * size + j] = lhs[i];

INFO("transpose");
xsimd::transpose(b_matrix.data(), b_matrix.data() + b_matrix.size());
for (size_t i = 0; i < size; ++i)
{
CHECK_BATCH_EQ(b_matrix[i], B::load_unaligned(&ref_matrix[i * size]));
}
}

void select()
{
B b_lhs = B::load_unaligned(lhs.data());
@@ -694,6 +713,10 @@ TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch<uint32_t>, xs
{
Test.swizzle();
}
SUBCASE("transpose")
{
Test.transpose();
}
SUBCASE("zip")
{
Test.zip();
