From 8ae87abc68fac263d6e94474eddb27aaf99ec409 Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Sat, 5 Oct 2024 19:41:24 +0200
Subject: [PATCH] Add support for xsimd::transpose

Currently only 32-bit types on sse, avx, neon and wasm

Fix #107 (6 years old!)
---
 docs/source/api/data_transfer.rst              |  2 +
 .../arch/generic/xsimd_generic_memory.hpp      | 26 ++++++++++++
 include/xsimd/arch/xsimd_avx.hpp               | 40 +++++++++++++++++++
 include/xsimd/arch/xsimd_neon.hpp              | 18 +++++++++
 include/xsimd/arch/xsimd_neon64.hpp            |  1 +
 include/xsimd/arch/xsimd_sse2.hpp              | 40 +++++++++++++++++++
 include/xsimd/arch/xsimd_wasm.hpp              | 20 ++++++++++
 include/xsimd/types/xsimd_api.hpp              | 17 ++++++++
 test/test_shuffle.cpp                          | 23 +++++++++++
 9 files changed, 187 insertions(+)

diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst
index d8102b1a4..5691073ec 100644
--- a/docs/source/api/data_transfer.rst
+++ b/docs/source/api/data_transfer.rst
@@ -61,6 +61,8 @@ In place:
 
 Between batches:
 
++---------------------------------------+----------------------------------------------------+
+| :cpp:func:`transpose`                 | transpose a matrix as an array of batches          |
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`zip_lo`                    | interleave low halves of two batches               |
 +---------------------------------------+----------------------------------------------------+
diff --git a/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/xsimd/arch/generic/xsimd_generic_memory.hpp
index 18c9c80ad..10c1ffe66 100644
--- a/include/xsimd/arch/generic/xsimd_generic_memory.hpp
+++ b/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -639,6 +639,32 @@ namespace xsimd
             hi.store_unaligned(buffer + real_batch::size);
         }
 
+        // transpose
+        template <class A, class T>
+        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            alignas(A::alignment()) T scratch_buffer[batch<T, A>::size * batch<T, A>::size];
+            for (size_t i = 0; i < batch<T, A>::size; ++i)
+            {
+                matrix_begin[i].store_aligned(&scratch_buffer[i * batch<T, A>::size]);
+            }
+            // FIXME: this is super naive we can probably do better.
+            for (size_t i = 0; i < batch<T, A>::size; ++i)
+            {
+                for (size_t j = 0; j < i; ++j)
+                {
+                    std::swap(scratch_buffer[i * batch<T, A>::size + j],
+                              scratch_buffer[j * batch<T, A>::size + i]);
+                }
+            }
+            for (size_t i = 0; i < batch<T, A>::size; ++i)
+            {
+                matrix_begin[i] = batch<T, A>::load_aligned(&scratch_buffer[i * batch<T, A>::size]);
+            }
+        }
+
     }
 
 }
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index f41702bab..32f609271 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1594,6 +1594,46 @@ namespace xsimd
             return bitwise_cast(
                 swizzle(bitwise_cast(self), mask));
         }
+        // transpose
+        template <class A>
+        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            // See
+            // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1],
+                 r2 = matrix_begin[2], r3 = matrix_begin[3],
+                 r4 = matrix_begin[4], r5 = matrix_begin[5],
+                 r6 = matrix_begin[6], r7 = matrix_begin[7];
+
+            auto t0 = _mm256_unpacklo_ps(r0, r1);
+            auto t1 = _mm256_unpackhi_ps(r0, r1);
+            auto t2 = _mm256_unpacklo_ps(r2, r3);
+            auto t3 = _mm256_unpackhi_ps(r2, r3);
+            auto t4 = _mm256_unpacklo_ps(r4, r5);
+            auto t5 = _mm256_unpackhi_ps(r4, r5);
+            auto t6 = _mm256_unpacklo_ps(r6, r7);
+            auto t7 = _mm256_unpackhi_ps(r6, r7);
+
+            r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
+            r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
+            r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
+            r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
+            r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
+            r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
+            r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
+            r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));
+
+            matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20);
+            matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20);
+            matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20);
+            matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20);
+            matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31);
+            matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31);
+            matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31);
+            matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
+        }
 
         // trunc
         template
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
index cd161305f..f6c791dca 100644
--- a/include/xsimd/arch/xsimd_neon.hpp
+++ b/include/xsimd/arch/xsimd_neon.hpp
@@ -1748,6 +1748,23 @@ namespace xsimd
             return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
         }
 
+        /*************
+         * transpose *
+         *************/
+        template <class A>
+        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<neon>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
+            auto t01 = vtrnq_f32(r0, r1);
+            auto t23 = vtrnq_f32(r2, r3);
+            matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
+            matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
+            matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
+            matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
+        }
+
         /**********
          * zip_lo *
          **********/
@@ -2737,6 +2754,7 @@ namespace xsimd
             return set(batch<T, A>(), A(), data[idx]...);
         }
     }
+
 }
 
 #undef WRAP_BINARY_INT_EXCLUDING_64
diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp
index d09997033..063622a9e 100644
--- a/include/xsimd/arch/xsimd_neon64.hpp
+++ b/include/xsimd/arch/xsimd_neon64.hpp
@@ -949,6 +949,7 @@ namespace xsimd
         {
             return select(batch_bool<T, A> { b... }, true_br, false_br, neon64 {});
         }
+
         /**********
          * zip_lo *
          **********/
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index 67b74f548..c0253cda6 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1640,6 +1640,46 @@ namespace xsimd
             return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {}));
         }
 
+        // transpose
+        template <class A>
+        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
+            _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+            matrix_begin[0] = r0;
+            matrix_begin[1] = r1;
+            matrix_begin[2] = r2;
+            matrix_begin[3] = r3;
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = bitwise_cast<float>(matrix_begin[0]), r1 = bitwise_cast<float>(matrix_begin[1]),
+                 r2 = bitwise_cast<float>(matrix_begin[2]), r3 = bitwise_cast<float>(matrix_begin[3]);
+            _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+            matrix_begin[0] = bitwise_cast<uint32_t>(r0);
+            matrix_begin[1] = bitwise_cast<uint32_t>(r1);
+            matrix_begin[2] = bitwise_cast<uint32_t>(r2);
+            matrix_begin[3] = bitwise_cast<uint32_t>(r3);
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = bitwise_cast<float>(matrix_begin[0]), r1 = bitwise_cast<float>(matrix_begin[1]),
+                 r2 = bitwise_cast<float>(matrix_begin[2]), r3 = bitwise_cast<float>(matrix_begin[3]);
+            _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+            matrix_begin[0] = bitwise_cast<int32_t>(r0);
+            matrix_begin[1] = bitwise_cast<int32_t>(r1);
+            matrix_begin[2] = bitwise_cast<int32_t>(r2);
+            matrix_begin[3] = bitwise_cast<int32_t>(r3);
+        }
+
         // zip_hi
         template
         XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept
diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
index 5316cce35..ff4f80357 100644
--- a/include/xsimd/arch/xsimd_wasm.hpp
+++ b/include/xsimd/arch/xsimd_wasm.hpp
@@ -1576,6 +1576,26 @@ namespace xsimd
             return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {}));
         }
 
+        // transpose
+        template <class A>
+        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<wasm>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
+
+            auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1]
+            auto t1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3]
+
+            auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1]
+            auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3]
+
+            matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0]
+            matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1]
+            matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
+            matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
+        }
+
         // trunc
         template
         XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index 6e9ec094c..5cd87c205 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -2516,6 +2516,23 @@ namespace xsimd
         return batch_cast<as_unsigned_integer_t<T>>(x);
     }
 
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Transposes in place the matrix whose lines are each of the batches passed as
+     * arguments.
+     * @param matrix_begin pointer to the first line of the matrix to transpose
+     * @param matrix_end pointer to one past the last line of the matrix to transpose
+     *
+     */
+    template <class T, class A>
+    XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end) noexcept
+    {
+        assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
+        detail::static_check_supported_config<T, A>();
+        return kernel::transpose<A>(matrix_begin, matrix_end, A {});
+    }
+
     /**
      * @ingroup batch_rounding
      *
diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp
index 3ea9dca4d..ed07095da 100644
--- a/test/test_shuffle.cpp
+++ b/test/test_shuffle.cpp
@@ -605,6 +605,25 @@ struct shuffle_test
         }
     }
 
+    void transpose()
+    {
+        B b_lhs = B::load_unaligned(lhs.data());
+        std::array<B, size> b_matrix;
+        for (size_t i = 0; i < size; ++i)
+            b_matrix[i] = b_lhs;
+        std::array<value_type, size * size> ref_matrix;
+        for (size_t i = 0; i < size; ++i)
+            for (size_t j = 0; j < size; ++j)
+                ref_matrix[i * size + j] = lhs[i];
+
+        INFO("transpose");
+        xsimd::transpose(b_matrix.data(), b_matrix.data() + b_matrix.size());
+        for (size_t i = 0; i < size; ++i)
+        {
+            CHECK_BATCH_EQ(b_matrix[i], B::load_unaligned(&ref_matrix[i * size]));
+        }
+    }
+
     void select()
     {
         B b_lhs = B::load_unaligned(lhs.data());
@@ -694,6 +713,10 @@ TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch, xs
     {
         Test.swizzle();
     }
+    SUBCASE("transpose")
+    {
+        Test.transpose();
+    }
    SUBCASE("zip")
    {
        Test.zip();
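
Usage note for reviewers, not part of the patch: a minimal sketch of the new entry point. The float element type, the values and the use of xsimd::default_arch are illustrative assumptions; the lane count n depends on the target (4 on sse2/neon/wasm, 8 on avx, other element types and widths go through the generic kernel).

    #include <cstddef>
    #include <cstdio>

    #include <xsimd/xsimd.hpp>

    int main()
    {
        using batch = xsimd::batch<float>;
        constexpr std::size_t n = batch::size;

        // Row-major n x n matrix with element (i, j) == i * n + j.
        alignas(xsimd::default_arch::alignment()) float data[n * n];
        for (std::size_t i = 0; i < n * n; ++i)
            data[i] = static_cast<float>(i);

        // One batch per matrix line, as expected by xsimd::transpose.
        batch lines[n];
        for (std::size_t i = 0; i < n; ++i)
            lines[i] = batch::load_aligned(&data[i * n]);

        // In-place transpose: lines[i] now holds what used to be column i.
        xsimd::transpose(&lines[0], &lines[0] + n);

        for (std::size_t i = 0; i < n; ++i)
        {
            lines[i].store_aligned(&data[i * n]);
            for (std::size_t j = 0; j < n; ++j)
                std::printf("%g ", static_cast<double>(data[i * n + j]));
            std::printf("\n");
        }
        return 0;
    }

The pointer pair must span exactly batch::size lines, i.e. a square matrix with one batch per line, which is what the assert in both the public API and the kernels checks.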