From a10dae8117b0288f62cd6ca3d555985d0d54df1f Mon Sep 17 00:00:00 2001 From: Victorin Date: Mon, 15 Jul 2024 18:05:24 +0200 Subject: [PATCH] 15/07 --- benchmark/bench_bmat16.cpp | 1 + include/hpcombi/bmat16.hpp | 80 +++++++++++++++----------------- include/hpcombi/bmat16_impl.hpp | 81 ++++++++++++++++++--------------- tests/test_bmat16.cpp | 19 ++++++-- 4 files changed, 96 insertions(+), 85 deletions(-) diff --git a/benchmark/bench_bmat16.cpp b/benchmark/bench_bmat16.cpp index fb00189..d54c973 100644 --- a/benchmark/bench_bmat16.cpp +++ b/benchmark/bench_bmat16.cpp @@ -65,6 +65,7 @@ TEST_CASE_METHOD(Fix_BMat16, "Transpose", "[BMat16][000]") { TEST_CASE_METHOD(Fix_BMat16, "Multiplication", "[BMat16][001]") { BENCHMARK_MEM_FN_PAIR(BMat16::operator*, pair_sample); + BENCHMARK_MEM_FN_PAIR(mult_4bmat8, pair_sample); BENCHMARK_MEM_FN_PAIR(mult_naive, pair_sample); BENCHMARK_MEM_FN_PAIR(mult_naive_array, pair_sample); } diff --git a/include/hpcombi/bmat16.hpp b/include/hpcombi/bmat16.hpp index b2b0a0a..a69c858 100644 --- a/include/hpcombi/bmat16.hpp +++ b/include/hpcombi/bmat16.hpp @@ -40,11 +40,15 @@ #include "bmat8.hpp" #include "simde/x86/avx2.h" +// #include "simde/x86/avx512/popcnt.h" namespace HPCombi { using xpu16 = uint16_t __attribute__((vector_size(32))); using xpu64 = uint64_t __attribute__((vector_size(32))); +xpu64 to_line(xpu64 vect); +xpu64 to_block(xpu64 vect); + //! Class for fast boolean matrices of dimension up to 16 x 16 //! //! The methods for these small matrices over the boolean semiring @@ -65,7 +69,7 @@ class BMat16 { //! A constructor. //! - //! This constructor initializes a BMat16 with a 256-bit register + //! This constructor initializes a matrix with a 256-bit register //! The rows are equal to the 16 chunks, of 16 bits each, //! of the binary representation of the matrix explicit BMat16(xpu64 mat) noexcept : @@ -75,9 +79,11 @@ class BMat16 { //! //! This constructor initializes a matrix with 4 64 bits unsigned int explicit BMat16(uint64_t n0, uint64_t n1, uint64_t n2, uint64_t n3) noexcept; - // explicit BMat16(uint64_t n0, uint64_t n1, uint64_t n2, uint64_t n3) noexcept : - // _data{xpu64{n0, n1, n2, n3}} {} + //! A constructor. + //! + //! This constructor initializes a matrix where the rows of the matrix + //! are the vectors in \p mat. explicit BMat16(std::vector> const &mat) noexcept; //! A constructor. @@ -105,12 +111,12 @@ class BMat16 { //! Returns \c true if \c this equals \p that. //! - //! This method checks the mathematical equality of two BMat8 objects. + //! This method checks the mathematical equality of two BMat16 objects. bool operator==(BMat16 const &that) const noexcept; //! Returns \c true if \c this does not equal \p that //! - //! This method checks the mathematical inequality of two BMat8 objects. + //! This method checks the mathematical inequality of two BMat16 objects. bool operator!=(BMat16 const &that) const noexcept { return !(*this == that); } @@ -118,22 +124,12 @@ class BMat16 { //! Returns \c true if \c this is less than \p that. //! //! This method checks whether a BMat16 objects is less than another. - //! We order by the results of to_int() for each matrix. bool operator<(BMat16 const &that) const noexcept; //! Returns \c true if \c this is greater than \p that. //! //! This method checks whether a BMat16 objects is greater than another. - //! We order by the results of to_int() for each matrix. - // bool operator>(BMat8 const &that) const noexcept { - // return _data > that._data; - // } - - // Conversion of type of storage, from blocks to lines - BMat16 to_line() const; - - // Conversion of type of storage, from lines to blocks - BMat16 to_block() const; + bool operator>(BMat16 const &that) const noexcept; //! Returns the entry in the (\p i, \p j)th position. //! @@ -157,45 +153,43 @@ class BMat16 { //! Returns the bitwise or between \c this and \p that //! //! This method perform the bitwise operator on the matrices and - //! returns the result as a BMat16 + //! returns the result as a BMat16. BMat16 operator|(BMat16 const& that) const noexcept { return BMat16(_data | that._data); } - //! Returns the transpose of \c this. - //! - //! Returns the standard matrix transpose of a BMat8. - //! Uses a naive technique, by simply iterating through all entries - BMat16 transpose_naive() const noexcept; - //! Returns the transpose of \c this. //! //! Returns the standard matrix transpose of a BMat16. - //! Uses a naive technique, by simply iterating through all entries - BMat16 transpose_block() const noexcept; + //! Uses a naive technique, by simply iterating through all entries. + BMat16 transpose_naive() const noexcept; //! Returns the transpose of \c this //! - //! Returns the standard matrix transpose of a BMat8. + //! Returns the standard matrix transpose of a BMat16. //! Uses the technique found in Knuth AoCP Vol. 4 Fasc. 1a, p. 15. - BMat16 transpose() const noexcept { - return to_block().transpose_block().to_line(); - } + BMat16 transpose() const noexcept; //! Returns the matrix product of \c this and the transpose of \p that //! //! This method returns the standard matrix product (over the - //! boolean semiring) of two BMat8 objects. This is faster than transposing + //! boolean semiring) of two BMat16 objects. This is faster than transposing //! that and calling the product of \c this with it. Implementation uses //! vector instructions. BMat16 mult_transpose(BMat16 const &that) const noexcept; - BMat16 mult_bmat8(BMat16 const &that) const noexcept; + //! Returns the matrix product of \c this and \p that + //! + //! This method returns the standard matrix product (over the + //! boolean semiring) of two BMat16 objects. + //! It comes down to 8 products of 8x8 matrices, + //! which make up a 16x16 when we cut it into 4. + BMat16 mult_4bmat8(BMat16 const &that) const noexcept; //! Returns the matrix product of \c this and \p that //! //! This method returns the standard matrix product (over the - //! boolean semiring) of two BMat8 objects. This is a fast implementation + //! boolean semiring) of two BMat16 objects. This is a fast implementation //! using transposition and vector instructions. BMat16 operator*(BMat16 const &that) const noexcept { return mult_transpose(that.transpose()); @@ -204,19 +198,19 @@ class BMat16 { //! Returns the matrix product of \c this and \p that //! //! This method returns the standard matrix product (over the - //! boolean semiring) of two BMat8 objects. It performs the most naive approach - //! by simply iterating through all entries using the access operator of BMat8 + //! boolean semiring) of two BMat16 objects. It performs the most naive approach + //! by simply iterating through all entries using the access operator of BMat16 BMat16 mult_naive(BMat16 const& that) const noexcept; //! Returns the matrix product of \c this and \p that //! //! This method returns the standard matrix product (over the - //! boolean semiring) of two BMat8 objects. It performs the most naive approach + //! boolean semiring) of two BMat16 objects. It performs the most naive approach //! by simply iterating through all entries using array conversion. BMat16 mult_naive_array(BMat16 const& that) const noexcept; - // //! Returns the number of non-zero rows of \c this - // size_t nr_rows() const noexcept; + //! Returns the number of non-zero rows of \c this + size_t nr_rows() const noexcept; //! Returns a \c std::vector for rows of \c this // Not noexcept because it constructs a vector @@ -232,20 +226,20 @@ class BMat16 { return BMat16(ones[dim >= 8 ? 8 : dim], 0, 0, ones[dim >= 8 ? dim - 8 : 0]); } - //! Returns a random BMat8 + //! Returns a random BMat16 //! - //! This method returns a BMat8 chosen at random. + //! This method returns a BMat16 chosen at random. // Not noexcept because random things aren't static BMat16 random(); - //! Returns a random square BMat8 up to dimension \p dim. + //! Returns a random square BMat16 up to dimension \p dim. //! - //! This method returns a BMat8 chosen at random, where only the + //! This method returns a BMat16 chosen at random, where only the //! top-left \p dim x \p dim entries may be non-zero. - // Not noexcept because BMat8::random above is not + // Not noexcept because BMat16::random above is not static BMat16 random(size_t dim); - // void swap(BMat16 &that) noexcept { std::swap(this->_data, that._data); } + void swap(BMat16 &that) noexcept { std::swap(this->_data, that._data); } // ! Write \c this on \c os // Not noexcept diff --git a/include/hpcombi/bmat16_impl.hpp b/include/hpcombi/bmat16_impl.hpp index ed01475..5a1a0a1 100644 --- a/include/hpcombi/bmat16_impl.hpp +++ b/include/hpcombi/bmat16_impl.hpp @@ -28,9 +28,17 @@ static_assert(std::is_trivial(), "BMat16 is not a trivial class!"); static constexpr xpu16 line{0x800, 0x901, 0xa02, 0xb03, 0xc04, 0xd05, 0xe06, 0xf07, 0x800, 0x901, 0xa02, 0xb03, 0xc04, 0xd05, 0xe06, 0xf07}; static constexpr xpu16 block{0x200, 0x604, 0xa08, 0xe0c, 0x301, 0x705, 0xb09, 0xf0d, 0x200, 0x604, 0xa08, 0xe0c, 0x301, 0x705, 0xb09, 0xf0d}; +inline xpu64 to_line(xpu64 vect) { + return simde_mm256_shuffle_epi8(vect, line); +} + +inline xpu64 to_block(xpu64 vect) { + return simde_mm256_shuffle_epi8(vect, block); +} + inline BMat16::BMat16(uint64_t n0, uint64_t n1, uint64_t n2, uint64_t n3) noexcept { xpu64 tmp{n0, n1, n2, n3}; - _data = simde_mm256_shuffle_epi8(tmp, line); + _data = to_line(tmp); } inline BMat16::BMat16(std::vector> const &mat) noexcept { @@ -57,15 +65,21 @@ inline bool BMat16::operator==(BMat16 const &that) const noexcept { } bool BMat16::operator<(BMat16 const &that) const noexcept { - return _data[0] < that._data[0] || (_data[0] == that._data[0] && ( - _data[1] < that._data[1] || (_data[1] == that._data[1] && ( - _data[2] < that._data[2] || (_data[2] == that._data[2] && ( - _data[3] < that._data[3] - )))))); + return _data[0] < that._data[0] || + (_data[0] == that._data[0] && (_data[1] < that._data[1] || + (_data[1] == that._data[1] && (_data[2] < that._data[2] || + (_data[2] == that._data[2] && (_data[3] < that._data[3])))))); +} + +bool BMat16::operator>(BMat16 const &that) const noexcept { + return _data[0] > that._data[0] || + (_data[0] == that._data[0] && (_data[1] > that._data[1] || + (_data[1] == that._data[1] && (_data[2] > that._data[2] || + (_data[2] == that._data[2] && (_data[3] > that._data[3])))))); } std::array, 16> BMat16::to_array() const noexcept { - xpu64 tmp = simde_mm256_shuffle_epi8(_data, block); + xpu64 tmp = to_block(_data); uint64_t a = tmp[0], b = tmp[1], c = tmp[2], d = tmp[3]; std::array, 16> res; for (int i = 0; i < 64; i++) { @@ -77,14 +91,6 @@ std::array, 16> BMat16::to_array() const noexcept { return res; } -inline BMat16 BMat16::to_line() const { - return BMat16(simde_mm256_shuffle_epi8(_data, line)); -} - -inline BMat16 BMat16::to_block() const { - return BMat16(simde_mm256_shuffle_epi8(_data, block)); -} - inline BMat16 BMat16::transpose_naive() const noexcept { uint64_t a = 0, b = 0, c = 0, d = 0; for (int i = 7; i >= 0; i--) { @@ -98,15 +104,16 @@ inline BMat16 BMat16::transpose_naive() const noexcept { return BMat16(a, b, c, d); } -inline BMat16 BMat16::transpose_block() const noexcept { - xpu64 x = simde_mm256_set_epi64x(_data[3], _data[1], _data[2], _data[0]); +inline BMat16 BMat16::transpose() const noexcept { + xpu64 tmp = to_block(_data); + xpu64 x = simde_mm256_set_epi64x(tmp[3], tmp[1], tmp[2], tmp[0]); xpu64 y = (x ^ (x >> 7)) & (xpu64{0xAA00AA00AA00AA, 0xAA00AA00AA00AA, 0xAA00AA00AA00AA, 0xAA00AA00AA00AA}); x = x ^ y ^ (y << 7); y = (x ^ (x >> 14)) & (xpu64{0xCCCC0000CCCC, 0xCCCC0000CCCC, 0xCCCC0000CCCC, 0xCCCC0000CCCC}); x = x ^ y ^ (y << 14); y = (x ^ (x >> 28)) & (xpu64{0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0}); x = x ^ y ^ (y << 28); - return BMat16(x); + return BMat16(to_line(x)); } static constexpr xpu16 rot{0x302, 0x504, 0x706, 0x908, 0xb0a, 0xd0c, 0xf0e, 0x100, 0x302, 0x504, 0x706, 0x908, 0xb0a, 0xd0c, 0xf0e, 0x100}; @@ -131,23 +138,16 @@ inline BMat16 BMat16::mult_transpose(BMat16 const &that) const noexcept { return BMat16(data); } -BMat16 BMat16::mult_bmat8(BMat16 const &that) const noexcept { - xpu64 tmp1 = simde_mm256_shuffle_epi8(_data, block), - tmp2 = simde_mm256_shuffle_epi8(that._data, block); - BMat8 t1_0(tmp1[0]), - t1_1(tmp1[1]), - t1_2(tmp1[2]), - t1_3(tmp1[3]), - t2_0 = BMat8(tmp2[0]).transpose(), - t2_1 = BMat8(tmp2[1]).transpose(), - t2_2 = BMat8(tmp2[2]).transpose(), - t2_3 = BMat8(tmp2[3]).transpose(); - return BMat16( - (t1_0.mult_transpose(t2_0) | t1_1.mult_transpose(t2_2)).to_int(), - (t1_0.mult_transpose(t2_1) | t1_1.mult_transpose(t2_3)).to_int(), - (t1_2.mult_transpose(t2_0) | t1_3.mult_transpose(t2_2)).to_int(), - (t1_2.mult_transpose(t2_1) | t1_3.mult_transpose(t2_3)).to_int() - ); +BMat16 BMat16::mult_4bmat8(BMat16 const &that) const noexcept { + BMat16 tmp = that.transpose(); + xpu64 t1 = to_block(_data), + t2 = to_block(tmp._data); + BMat8 a1(t1[0]), b1(t1[1]), c1(t1[2]), d1(t1[3]), + a2(t2[0]), b2(t2[1]), c2(t2[2]), d2(t2[3]); + return BMat16((a1.mult_transpose(a2) | b1.mult_transpose(b2)).to_int(), + (a1.mult_transpose(c2) | b1.mult_transpose(d2)).to_int(), + (c1.mult_transpose(a2) | d1.mult_transpose(b2)).to_int(), + (c1.mult_transpose(c2) | d1.mult_transpose(d2)).to_int()); } BMat16 BMat16::mult_naive(BMat16 const &that) const noexcept { @@ -183,10 +183,17 @@ BMat16 BMat16::mult_naive_array(BMat16 const &that) const noexcept { return BMat16(a, b, c, d); } -inline std::vector BMat16::rows() const { // A changer +inline size_t BMat16::nr_rows() const noexcept{ + xpu16 tmp = _data, zero = simde_mm256_setzero_si256(); + xpu16 x = (tmp != zero); + return 0; + // return simde_mm256_popcnt_epi16(x); // To change +} + +inline std::vector BMat16::rows() const { // To change std::vector rows; for (size_t i = 0; i < 16; ++i) { - uint16_t row = static_cast(_data[i/4] << (8 * i) >> 56); + uint16_t row = static_cast(_data[i/4] << (16 * (i%4)) >> 48); rows.push_back(row); } return rows; diff --git a/tests/test_bmat16.cpp b/tests/test_bmat16.cpp index 9b5eb92..3f67528 100644 --- a/tests/test_bmat16.cpp +++ b/tests/test_bmat16.cpp @@ -162,10 +162,11 @@ TEST_CASE_METHOD(BMat16Fixture, "BMat16::operator*", "[BMat16][002]") { } } -TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_naive, BMlist, "[BMat16][003]"); -TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_naive_array, BMlist, "[BMat16][004]"); +TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_4bmat8, BMlist, "[BMat16][003]"); +TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_naive, BMlist, "[BMat16][004]"); +TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_naive_array, BMlist, "[BMat16][005]"); -TEST_CASE("BMat16::random", "[BMat16][005]") { +TEST_CASE("BMat16::random", "[BMat16][006]") { for (size_t d = 1; d < 8; ++d) { BMat16 bm = BMat16::random(d); for (size_t i = d + 1; i < 16; ++i) { @@ -177,7 +178,7 @@ TEST_CASE("BMat16::random", "[BMat16][005]") { } } -TEST_CASE("BMat8::operator()", "[BMat8][006]") { +TEST_CASE("BMat16::operator()", "[BMat16][007]") { std::vector> mat = { {0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0}, {0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0}, @@ -202,7 +203,7 @@ TEST_CASE("BMat8::operator()", "[BMat8][006]") { } } -TEST_CASE_METHOD(BMat16Fixture, "BMa16::operator<<", "[BMat16][007]") { +TEST_CASE_METHOD(BMat16Fixture, "BMat16::operator<<", "[BMat16][008]") { std::ostringstream oss; oss << bm3; CHECK(oss.str() == "0001010001011011\n" @@ -227,4 +228,12 @@ TEST_CASE_METHOD(BMat16Fixture, "BMa16::operator<<", "[BMat16][007]") { os << BMat8::random(); // Also does not do anything visible } +TEST_CASE_METHOD(BMat16Fixture, "BMat16::nr_rows", "[BMat16][009]") { + CHECK(zero.nr_rows() == 0); + CHECK(one1.nr_rows() == 1); + CHECK(one2.nr_rows() == 2); + CHECK(bm.nr_rows() == 16); + CHECK(BMat16({{1, 0, 1}, {1, 1, 0}, {0, 0, 0}}).nr_rows() == 2); +} + } // namespace HPCombi