From a10dae8117b0288f62cd6ca3d555985d0d54df1f Mon Sep 17 00:00:00 2001
From: Victorin <victorin.brunel@universite-paris-saclay.fr>
Date: Mon, 15 Jul 2024 18:05:24 +0200
Subject: [PATCH] 15/07

---
 benchmark/bench_bmat16.cpp      |  1 +
 include/hpcombi/bmat16.hpp      | 80 +++++++++++++++-----------------
 include/hpcombi/bmat16_impl.hpp | 81 ++++++++++++++++++---------------
 tests/test_bmat16.cpp           | 19 ++++++--
 4 files changed, 96 insertions(+), 85 deletions(-)
diff --git a/benchmark/bench_bmat16.cpp b/benchmark/bench_bmat16.cpp
index fb00189..d54c973 100644
--- a/benchmark/bench_bmat16.cpp
+++ b/benchmark/bench_bmat16.cpp
@@ -65,6 +65,7 @@ TEST_CASE_METHOD(Fix_BMat16, "Transpose", "[BMat16][000]") {
 
 TEST_CASE_METHOD(Fix_BMat16, "Multiplication", "[BMat16][001]") {
     BENCHMARK_MEM_FN_PAIR(BMat16::operator*, pair_sample);
+    BENCHMARK_MEM_FN_PAIR(mult_4bmat8, pair_sample);
     BENCHMARK_MEM_FN_PAIR(mult_naive, pair_sample);
     BENCHMARK_MEM_FN_PAIR(mult_naive_array, pair_sample);
 }
diff --git a/include/hpcombi/bmat16.hpp b/include/hpcombi/bmat16.hpp
index b2b0a0a..a69c858 100644
--- a/include/hpcombi/bmat16.hpp
+++ b/include/hpcombi/bmat16.hpp
@@ -40,11 +40,15 @@
 #include "bmat8.hpp"
 
 #include "simde/x86/avx2.h"
+// #include "simde/x86/avx512/popcnt.h"
 
 namespace HPCombi {
 using xpu16 = uint16_t __attribute__((vector_size(32)));
 using xpu64 = uint64_t __attribute__((vector_size(32)));
 
+xpu64 to_line(xpu64 vect);
+xpu64 to_block(xpu64 vect);
+
 //! Class for fast boolean matrices of dimension up to 16 x 16
 //!
 //! The methods for these small matrices over the boolean semiring
@@ -65,7 +69,7 @@ class BMat16 {
 
     //! A constructor.
     //!
-    //! This constructor initializes a BMat16 with a 256-bit register
+    //! This constructor initializes a matrix with a 256-bit register
     //! The rows are equal to the 16 chunks, of 16 bits each, 
     //! of the binary representation of the matrix
     explicit BMat16(xpu64 mat) noexcept : 
@@ -75,9 +79,11 @@ class BMat16 {
     //!
     //! This constructor initializes a matrix with 4 64 bits unsigned int
     explicit BMat16(uint64_t n0, uint64_t n1, uint64_t n2, uint64_t n3) noexcept;
-    // explicit BMat16(uint64_t n0, uint64_t n1, uint64_t n2, uint64_t n3) noexcept :
-    //     _data{xpu64{n0, n1, n2, n3}} {}
     
+    //! A constructor.
+    //!
+    //! This constructor initializes a matrix where the rows of the matrix
+    //! are the vectors in \p mat.
     explicit BMat16(std::vector<std::vector<bool>> const &mat) noexcept;
 
     //! A constructor.
@@ -105,12 +111,12 @@ class BMat16 {
 
     //! Returns \c true if \c this equals \p that.
     //!
-    //! This method checks the mathematical equality of two BMat8 objects.
+    //! This method checks the mathematical equality of two BMat16 objects.
     bool operator==(BMat16 const &that) const noexcept;
 
     //! Returns \c true if \c this does not equal \p that
     //!
-    //! This method checks the mathematical inequality of two BMat8 objects.
+    //! This method checks the mathematical inequality of two BMat16 objects.
     bool operator!=(BMat16 const &that) const noexcept {
         return !(*this == that);
     }
@@ -118,22 +124,12 @@ class BMat16 {
     //! Returns \c true if \c this is less than \p that.
     //!
     //! This method checks whether a BMat16 objects is less than another.
-    //! We order by the results of to_int() for each matrix.
     bool operator<(BMat16 const &that) const noexcept;
 
     //! Returns \c true if \c this is greater than \p that.
     //!
     //! This method checks whether a BMat16 objects is greater than another.
-    //! We order by the results of to_int() for each matrix.
-    // bool operator>(BMat8 const &that) const noexcept {
-    //     return _data > that._data;
-    // }
-
-    // Conversion of type of storage, from blocks to lines
-    BMat16 to_line() const;
-
-    // Conversion of type of storage, from lines to blocks
-    BMat16 to_block() const;
+    bool operator>(BMat16 const &that) const noexcept;
 
     //! Returns the entry in the (\p i, \p j)th position.
     //!
@@ -157,45 +153,43 @@ class BMat16 {
     //! Returns the bitwise or between \c this and \p that
     //!
     //! This method perform the bitwise operator on the matrices and
-    //! returns the result as a BMat16
+    //! returns the result as a BMat16.
     BMat16 operator|(BMat16 const& that) const noexcept {
         return BMat16(_data | that._data);
     }
 
-    //! Returns the transpose of \c this.
-    //!
-    //! Returns the standard matrix transpose of a BMat8.
-    //! Uses a naive technique, by simply iterating through all entries
-    BMat16 transpose_naive() const noexcept;
-
     //! Returns the transpose of \c this.
     //!
     //! Returns the standard matrix transpose of a BMat16.
-    //! Uses a naive technique, by simply iterating through all entries
-    BMat16 transpose_block() const noexcept;
+    //! Uses a naive technique, by simply iterating through all entries.
+    BMat16 transpose_naive() const noexcept;
 
     //! Returns the transpose of \c this
     //!
-    //! Returns the standard matrix transpose of a BMat8.
+    //! Returns the standard matrix transpose of a BMat16.
     //! Uses the technique found in Knuth AoCP Vol. 4 Fasc. 1a, p. 15.
-    BMat16 transpose() const noexcept {
-        return to_block().transpose_block().to_line();
-    }
+    BMat16 transpose() const noexcept;
 
     //! Returns the matrix product of \c this and the transpose of \p that
     //!
     //! This method returns the standard matrix product (over the
-    //! boolean semiring) of two BMat8 objects. This is faster than transposing
+    //! boolean semiring) of two BMat16 objects. This is faster than transposing
     //! that and calling the product of \c this with it. Implementation uses
     //! vector instructions.
     BMat16 mult_transpose(BMat16 const &that) const noexcept;
 
-    BMat16 mult_bmat8(BMat16 const &that) const noexcept;
+    //! Returns the matrix product of \c this and \p that
+    //!
+    //! This method returns the standard matrix product (over the
+    //! boolean semiring) of two BMat16 objects. 
+    //! It comes down to 8 products of 8x8 matrices, 
+    //! which make up a 16x16 when we cut it into 4.
+    BMat16 mult_4bmat8(BMat16 const &that) const noexcept;
 
     //! Returns the matrix product of \c this and \p that
     //!
     //! This method returns the standard matrix product (over the
-    //! boolean semiring) of two BMat8 objects. This is a fast implementation
+    //! boolean semiring) of two BMat16 objects. This is a fast implementation
     //! using transposition and vector instructions.
     BMat16 operator*(BMat16 const &that) const noexcept {
         return mult_transpose(that.transpose());
@@ -204,19 +198,19 @@ class BMat16 {
     //! Returns the matrix product of \c this and \p that
     //!
     //! This method returns the standard matrix product (over the
-    //! boolean semiring) of two BMat8 objects. It performs the most naive approach
-    //! by simply iterating through all entries using the access operator of BMat8
+    //! boolean semiring) of two BMat16 objects. It performs the most naive approach
+    //! by simply iterating through all entries using the access operator of BMat16
     BMat16 mult_naive(BMat16 const& that) const noexcept;
 
     //! Returns the matrix product of \c this and \p that
     //!
     //! This method returns the standard matrix product (over the
-    //! boolean semiring) of two BMat8 objects. It performs the most naive approach
+    //! boolean semiring) of two BMat16 objects. It performs the most naive approach
     //! by simply iterating through all entries using array conversion.
     BMat16 mult_naive_array(BMat16 const& that) const noexcept;
 
-    // //! Returns the number of non-zero rows of \c this
-    // size_t nr_rows() const noexcept;
+    //! Returns the number of non-zero rows of \c this
+    size_t nr_rows() const noexcept;
 
     //! Returns a \c std::vector for rows of \c this
     // Not noexcept because it constructs a vector
@@ -232,20 +226,20 @@ class BMat16 {
         return BMat16(ones[dim >= 8 ? 8 : dim], 0, 0, ones[dim >= 8 ? dim - 8 : 0]);
     }
 
-    //! Returns a random BMat8
+    //! Returns a random BMat16
     //!
-    //! This method returns a BMat8 chosen at random.
+    //! This method returns a BMat16 chosen at random.
     // Not noexcept because random things aren't
     static BMat16 random();
 
-    //! Returns a random square BMat8 up to dimension \p dim.
+    //! Returns a random square BMat16 up to dimension \p dim.
     //!
-    //! This method returns a BMat8 chosen at random, where only the
+    //! This method returns a BMat16 chosen at random, where only the
     //! top-left \p dim x \p dim entries may be non-zero.
-    // Not noexcept because BMat8::random above is not
+    // Not noexcept because BMat16::random above is not
     static BMat16 random(size_t dim);
 
-    // void swap(BMat16 &that) noexcept { std::swap(this->_data, that._data); }
+    void swap(BMat16 &that) noexcept { std::swap(this->_data, that._data); }
 
     // ! Write \c this on \c os
     // Not noexcept
diff --git a/include/hpcombi/bmat16_impl.hpp b/include/hpcombi/bmat16_impl.hpp
index ed01475..5a1a0a1 100644
--- a/include/hpcombi/bmat16_impl.hpp
+++ b/include/hpcombi/bmat16_impl.hpp
@@ -28,9 +28,17 @@ static_assert(std::is_trivial<BMat16>(), "BMat16 is not a trivial class!");
 static constexpr xpu16 line{0x800, 0x901, 0xa02, 0xb03, 0xc04, 0xd05, 0xe06, 0xf07, 0x800, 0x901, 0xa02, 0xb03, 0xc04, 0xd05, 0xe06, 0xf07};
 static constexpr xpu16 block{0x200, 0x604, 0xa08, 0xe0c, 0x301, 0x705, 0xb09, 0xf0d, 0x200, 0x604, 0xa08, 0xe0c, 0x301, 0x705, 0xb09, 0xf0d};
 
+inline xpu64 to_line(xpu64 vect) {
+    return simde_mm256_shuffle_epi8(vect, line);
+}
+
+inline xpu64 to_block(xpu64 vect) {
+    return simde_mm256_shuffle_epi8(vect, block);
+}
+
 inline BMat16::BMat16(uint64_t n0, uint64_t n1, uint64_t n2, uint64_t n3) noexcept {
     xpu64 tmp{n0, n1, n2, n3};
-    _data = simde_mm256_shuffle_epi8(tmp, line);
+    _data = to_line(tmp);
 }
 
 inline BMat16::BMat16(std::vector<std::vector<bool>> const &mat) noexcept {
@@ -57,15 +65,21 @@ inline bool BMat16::operator==(BMat16 const &that) const noexcept {
 }
 
 bool BMat16::operator<(BMat16 const &that) const noexcept {
-    return _data[0] < that._data[0] || (_data[0] == that._data[0] && (
-           _data[1] < that._data[1] || (_data[1] == that._data[1] && (
-           _data[2] < that._data[2] || (_data[2] == that._data[2] && (
-           _data[3] < that._data[3]
-           ))))));
+    return _data[0] < that._data[0] || 
+            (_data[0] == that._data[0] && (_data[1] < that._data[1] || 
+            (_data[1] == that._data[1] && (_data[2] < that._data[2] || 
+            (_data[2] == that._data[2] && (_data[3] < that._data[3]))))));
+}
+
+bool BMat16::operator>(BMat16 const &that) const noexcept {
+    return _data[0] > that._data[0] || 
+            (_data[0] == that._data[0] && (_data[1] > that._data[1] || 
+            (_data[1] == that._data[1] && (_data[2] > that._data[2] || 
+            (_data[2] == that._data[2] && (_data[3] > that._data[3]))))));
 }
 
 std::array<std::array<bool, 16>, 16> BMat16::to_array() const noexcept {
-    xpu64 tmp = simde_mm256_shuffle_epi8(_data, block);
+    xpu64 tmp = to_block(_data);
     uint64_t a = tmp[0], b = tmp[1], c = tmp[2], d = tmp[3];
     std::array<std::array<bool, 16>, 16> res;
     for (int i = 0; i < 64; i++) {
@@ -77,14 +91,6 @@ std::array<std::array<bool, 16>, 16> BMat16::to_array() const noexcept {
     return res;
 }
 
-inline BMat16 BMat16::to_line() const {
-    return BMat16(simde_mm256_shuffle_epi8(_data, line));
-}
-
-inline BMat16 BMat16::to_block() const {
-    return BMat16(simde_mm256_shuffle_epi8(_data, block));
-}
-
 inline BMat16 BMat16::transpose_naive() const noexcept {
     uint64_t a = 0, b = 0, c = 0, d = 0;
     for (int i = 7; i >= 0; i--) {
@@ -98,15 +104,16 @@ inline BMat16 BMat16::transpose_naive() const noexcept {
     return BMat16(a, b, c, d);
 }
 
-inline BMat16 BMat16::transpose_block() const noexcept {
-    xpu64 x = simde_mm256_set_epi64x(_data[3], _data[1], _data[2], _data[0]);
+inline BMat16 BMat16::transpose() const noexcept {
+    xpu64 tmp = to_block(_data);
+    xpu64 x = simde_mm256_set_epi64x(tmp[3], tmp[1], tmp[2], tmp[0]);
     xpu64 y = (x ^ (x >> 7)) & (xpu64{0xAA00AA00AA00AA, 0xAA00AA00AA00AA, 0xAA00AA00AA00AA, 0xAA00AA00AA00AA});
     x = x ^ y ^ (y << 7);
     y = (x ^ (x >> 14)) & (xpu64{0xCCCC0000CCCC, 0xCCCC0000CCCC, 0xCCCC0000CCCC, 0xCCCC0000CCCC});
     x = x ^ y ^ (y << 14);
     y = (x ^ (x >> 28)) & (xpu64{0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0});
     x = x ^ y ^ (y << 28);
-    return BMat16(x);
+    return BMat16(to_line(x));
 }
 
 static constexpr xpu16 rot{0x302, 0x504, 0x706, 0x908, 0xb0a, 0xd0c, 0xf0e, 0x100, 0x302, 0x504, 0x706, 0x908, 0xb0a, 0xd0c, 0xf0e, 0x100};
@@ -131,23 +138,16 @@ inline BMat16 BMat16::mult_transpose(BMat16 const &that) const noexcept {
     return BMat16(data);
 }
 
-BMat16 BMat16::mult_bmat8(BMat16 const &that) const noexcept {
-    xpu64 tmp1 = simde_mm256_shuffle_epi8(_data, block),
-          tmp2 = simde_mm256_shuffle_epi8(that._data, block);
-    BMat8 t1_0(tmp1[0]), 
-          t1_1(tmp1[1]), 
-          t1_2(tmp1[2]), 
-          t1_3(tmp1[3]), 
-          t2_0 = BMat8(tmp2[0]).transpose(), 
-          t2_1 = BMat8(tmp2[1]).transpose(), 
-          t2_2 = BMat8(tmp2[2]).transpose(), 
-          t2_3 = BMat8(tmp2[3]).transpose();
-    return BMat16(
-        (t1_0.mult_transpose(t2_0) | t1_1.mult_transpose(t2_2)).to_int(), 
-        (t1_0.mult_transpose(t2_1) | t1_1.mult_transpose(t2_3)).to_int(), 
-        (t1_2.mult_transpose(t2_0) | t1_3.mult_transpose(t2_2)).to_int(),
-        (t1_2.mult_transpose(t2_1) | t1_3.mult_transpose(t2_3)).to_int()
-    );
+BMat16 BMat16::mult_4bmat8(BMat16 const &that) const noexcept {
+    BMat16 tmp = that.transpose();
+    xpu64 t1 = to_block(_data), 
+          t2 = to_block(tmp._data);
+    BMat8 a1(t1[0]), b1(t1[1]), c1(t1[2]), d1(t1[3]),
+          a2(t2[0]), b2(t2[1]), c2(t2[2]), d2(t2[3]);
+    return BMat16((a1.mult_transpose(a2) | b1.mult_transpose(b2)).to_int(), 
+                  (a1.mult_transpose(c2) | b1.mult_transpose(d2)).to_int(), 
+                  (c1.mult_transpose(a2) | d1.mult_transpose(b2)).to_int(), 
+                  (c1.mult_transpose(c2) | d1.mult_transpose(d2)).to_int());
 }
 
 BMat16 BMat16::mult_naive(BMat16 const &that) const noexcept {
@@ -183,10 +183,17 @@ BMat16 BMat16::mult_naive_array(BMat16 const &that) const noexcept {
     return BMat16(a, b, c, d);
 }
 
-inline std::vector<uint16_t> BMat16::rows() const { // A changer
+inline size_t BMat16::nr_rows() const noexcept{
+    xpu16 tmp = _data, zero = simde_mm256_setzero_si256();
+    xpu16 x = (tmp != zero);
+    return 0;
+    // return simde_mm256_popcnt_epi16(x); // To change
+}
+
+inline std::vector<uint16_t> BMat16::rows() const { // To change
     std::vector<uint16_t> rows;
     for (size_t i = 0; i < 16; ++i) {
-        uint16_t row = static_cast<uint16_t>(_data[i/4] << (8 * i) >> 56);
+        uint16_t row = static_cast<uint16_t>(_data[i/4] << (16 * (i%4)) >> 48);
         rows.push_back(row);
     }
     return rows;
diff --git a/tests/test_bmat16.cpp b/tests/test_bmat16.cpp
index 9b5eb92..3f67528 100644
--- a/tests/test_bmat16.cpp
+++ b/tests/test_bmat16.cpp
@@ -162,10 +162,11 @@ TEST_CASE_METHOD(BMat16Fixture, "BMat16::operator*", "[BMat16][002]") {
     }
 }
 
-TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_naive, BMlist, "[BMat16][003]");
-TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_naive_array, BMlist, "[BMat16][004]");
+TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_4bmat8, BMlist, "[BMat16][003]");
+TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_naive, BMlist, "[BMat16][004]");
+TEST_AGREES2(BMat16Fixture, BMat16::operator*, mult_naive_array, BMlist, "[BMat16][005]");
 
-TEST_CASE("BMat16::random", "[BMat16][005]") {
+TEST_CASE("BMat16::random", "[BMat16][006]") {
     for (size_t d = 1; d < 8; ++d) {
         BMat16 bm = BMat16::random(d);
         for (size_t i = d + 1; i < 16; ++i) {
@@ -177,7 +178,7 @@ TEST_CASE("BMat16::random", "[BMat16][005]") {
     }
 }
 
-TEST_CASE("BMat8::operator()", "[BMat8][006]") {
+TEST_CASE("BMat16::operator()", "[BMat16][007]") {
     std::vector<std::vector<bool>> mat = {
         {0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0},
         {0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0},
@@ -202,7 +203,7 @@ TEST_CASE("BMat8::operator()", "[BMat8][006]") {
     }
 }
 
-TEST_CASE_METHOD(BMat16Fixture, "BMa16::operator<<", "[BMat16][007]") {
+TEST_CASE_METHOD(BMat16Fixture, "BMat16::operator<<", "[BMat16][008]") {
     std::ostringstream oss;
     oss << bm3;
     CHECK(oss.str() == "0001010001011011\n"
@@ -227,4 +228,12 @@ TEST_CASE_METHOD(BMat16Fixture, "BMa16::operator<<", "[BMat16][007]") {
     os << BMat8::random();  // Also does not do anything visible
 }
 
+TEST_CASE_METHOD(BMat16Fixture, "BMat16::nr_rows", "[BMat16][009]") {
+    CHECK(zero.nr_rows() == 0);
+    CHECK(one1.nr_rows() == 1);
+    CHECK(one2.nr_rows() == 2);
+    CHECK(bm.nr_rows() == 16);
+    CHECK(BMat16({{1, 0, 1}, {1, 1, 0}, {0, 0, 0}}).nr_rows() == 2);
+}
+
 }  // namespace HPCombi