diff --git a/include/hpcombi/bmat16.hpp b/include/hpcombi/bmat16.hpp index a69c858..f40f9af 100644 --- a/include/hpcombi/bmat16.hpp +++ b/include/hpcombi/bmat16.hpp @@ -78,6 +78,7 @@ class BMat16 { //! A constructor. //! //! This constructor initializes a matrix with 4 64 bits unsigned int + //! Each uint represents one of the four quarter (8x8 matrix). explicit BMat16(uint64_t n0, uint64_t n1, uint64_t n2, uint64_t n3) noexcept; //! A constructor. @@ -138,12 +139,12 @@ class BMat16 { //! is possible to access entries that you might not believe exist. bool operator()(size_t i, size_t j) const noexcept; - // //! Sets the (\p i, \p j)th position to \p val. - // //! - // //! This method sets the (\p i, \p j)th entry of \c this to \p val. - // //! Uses the bit twiddle for setting bits found - // //! here. - // void set(size_t i, size_t j, bool val) noexcept; + //! Sets the (\p i, \p j)th position to \p val. + //! + //! This method sets the (\p i, \p j)th entry of \c this to \p val. + //! Uses the bit twiddle for setting bits found + //! here. + void set(size_t i, size_t j, bool val) noexcept; //! Returns the array representation of \c this. //! diff --git a/include/hpcombi/bmat16_impl.hpp b/include/hpcombi/bmat16_impl.hpp index 5a1a0a1..5824ba0 100644 --- a/include/hpcombi/bmat16_impl.hpp +++ b/include/hpcombi/bmat16_impl.hpp @@ -45,18 +45,30 @@ inline BMat16::BMat16(std::vector> const &mat) noexcept { HPCOMBI_ASSERT(mat.size() <= 16); HPCOMBI_ASSERT(0 < mat.size()); std::array tmp = {0, 0, 0, 0}; - for (int i = mat.size() - 1; i >= 0; i--) { + for (int i = mat.size() - 1; i >= 0; --i) { HPCOMBI_ASSERT(mat.size() == mat[i].size()); tmp[i/4] <<= 16 - mat.size(); - for (int j = mat[i].size() - 1; j >= 0; j--) { + for (int j = mat[i].size() - 1; j >= 0; --j) { tmp[i/4] = (tmp[i/4] << 1) | mat[i][j]; } } _data = xpu64{tmp[0], tmp[1], tmp[2], tmp[3]}; } -bool BMat16::operator()(size_t i, size_t j) const noexcept { - return (_data[i/4] >> (16 * (i%4) + j)) % 2; +inline bool BMat16::operator()(size_t i, size_t j) const noexcept { + return (_data[i/4] >> (16 * (i%4) + j)) & 1; +} + +inline void BMat16::set(size_t i, size_t j, bool val) noexcept { + HPCOMBI_ASSERT(i < 16); + HPCOMBI_ASSERT(j < 16); + uint64_t a = 1; + a <<= 16 * (i%4) + j; + xpu64 mask{(i/4 == 0)*a, + (i/4 == 1)*a, + (i/4 == 2)*a, + (i/4 == 3)*a}; + _data ^= (-val ^ _data) & mask; } inline bool BMat16::operator==(BMat16 const &that) const noexcept { @@ -64,37 +76,37 @@ inline bool BMat16::operator==(BMat16 const &that) const noexcept { return simde_mm256_testz_si256(tmp, tmp); } -bool BMat16::operator<(BMat16 const &that) const noexcept { +inline bool BMat16::operator<(BMat16 const &that) const noexcept { return _data[0] < that._data[0] || (_data[0] == that._data[0] && (_data[1] < that._data[1] || (_data[1] == that._data[1] && (_data[2] < that._data[2] || (_data[2] == that._data[2] && (_data[3] < that._data[3])))))); } -bool BMat16::operator>(BMat16 const &that) const noexcept { +inline bool BMat16::operator>(BMat16 const &that) const noexcept { return _data[0] > that._data[0] || (_data[0] == that._data[0] && (_data[1] > that._data[1] || (_data[1] == that._data[1] && (_data[2] > that._data[2] || (_data[2] == that._data[2] && (_data[3] > that._data[3])))))); } -std::array, 16> BMat16::to_array() const noexcept { +inline std::array, 16> BMat16::to_array() const noexcept { xpu64 tmp = to_block(_data); uint64_t a = tmp[0], b = tmp[1], c = tmp[2], d = tmp[3]; std::array, 16> res; - for (int i = 0; i < 64; i++) { - res[i/8][i%8] = a % 2; a >>= 1; - res[i/8][8 + i%8] = b % 2; b >>= 1; - res[8 + i/8][i%8] = c % 2; c >>= 1; - res[8 + i/8][8 + i%8] = d % 2; d >>= 1; + for (size_t i = 0; i < 64; ++i) { + res[i/8][i%8] = a & 1; a >>= 1; + res[i/8][8 + i%8] = b & 1; b >>= 1; + res[8 + i/8][i%8] = c & 1; c >>= 1; + res[8 + i/8][8 + i%8] = d & 1; d >>= 1; } return res; } inline BMat16 BMat16::transpose_naive() const noexcept { uint64_t a = 0, b = 0, c = 0, d = 0; - for (int i = 7; i >= 0; i--) { - for (int j = 7; j >= 0; j--) { + for (int i = 7; i >= 0; --i) { + for (int j = 7; j >= 0; --j) { a = (a << 1) | (*this)(j, i); b = (b << 1) | (*this)(j+8, i); c = (c << 1) | (*this)(j, i+8); @@ -127,7 +139,7 @@ inline BMat16 BMat16::mult_transpose(BMat16 const &that) const noexcept { xpu16 data = simde_mm256_setzero_si256(); xpu16 diag1{0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; xpu16 diag2{0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; - for (int i = 0; i < 8; ++i) { + for (size_t i = 0; i < 8; ++i) { data |= ((x & y1) != zero) & diag1; data |= ((x & y2) != zero) & diag2; y1 = simde_mm256_shuffle_epi8(y1, rot); @@ -138,7 +150,7 @@ inline BMat16 BMat16::mult_transpose(BMat16 const &that) const noexcept { return BMat16(data); } -BMat16 BMat16::mult_4bmat8(BMat16 const &that) const noexcept { +inline BMat16 BMat16::mult_4bmat8(BMat16 const &that) const noexcept { BMat16 tmp = that.transpose(); xpu64 t1 = to_block(_data), t2 = to_block(tmp._data); @@ -150,12 +162,12 @@ BMat16 BMat16::mult_4bmat8(BMat16 const &that) const noexcept { (c1.mult_transpose(c2) | d1.mult_transpose(d2)).to_int()); } -BMat16 BMat16::mult_naive(BMat16 const &that) const noexcept { +inline BMat16 BMat16::mult_naive(BMat16 const &that) const noexcept { uint64_t a = 0, b = 0, c = 0, d = 0; - for (int i = 7; i >= 0; i--) { - for (int j = 7; j >= 0; j--) { + for (int i = 7; i >= 0; --i) { + for (int j = 7; j >= 0; --j) { a <<= 1; b <<= 1; c <<= 1; d <<= 1; - for (int k = 0; k < 8; k++) { + for (size_t k = 0; k < 8; ++k) { a |= ((*this)(i, k) & that(k, j)) | ((*this)(i, k + 8) & that(k + 8, j)); b |= ((*this)(i, k) & that(k, j + 8)) | ((*this)(i, k + 8) & that(k + 8, j + 8)); c |= ((*this)(i + 8, k) & that(k, j)) | ((*this)(i + 8, k + 8) & that(k + 8, j)); @@ -166,13 +178,13 @@ BMat16 BMat16::mult_naive(BMat16 const &that) const noexcept { return BMat16(a, b, c, d); } -BMat16 BMat16::mult_naive_array(BMat16 const &that) const noexcept { +inline BMat16 BMat16::mult_naive_array(BMat16 const &that) const noexcept { std::array, 16> tab1 = to_array(), tab2 = that.to_array(); uint64_t a = 0, b = 0, c = 0, d = 0; - for (int i = 7; i >= 0; i--) { - for (int j = 7; j >= 0; j--) { + for (int i = 7; i >= 0; --i) { + for (int j = 7; j >= 0; --j) { a <<= 1; b <<= 1; c <<= 1; d <<= 1; - for (int k = 0; k < 16; k++) { + for (size_t k = 0; k < 16; ++k) { a |= tab1[i][k] & tab2[k][j]; b |= tab1[i][k] & tab2[k][j + 8]; c |= tab1[i + 8][k] & tab2[k][j]; @@ -184,16 +196,27 @@ BMat16 BMat16::mult_naive_array(BMat16 const &that) const noexcept { } inline size_t BMat16::nr_rows() const noexcept{ - xpu16 tmp = _data, zero = simde_mm256_setzero_si256(); - xpu16 x = (tmp != zero); - return 0; - // return simde_mm256_popcnt_epi16(x); // To change + size_t res = 0; + for (size_t i = 0; i < 16; ++i) + if ((_data[i/4] << (16 * (i%4)) >> 48) != 0) + ++res; + return res; + + //// Vectorized version that doesn't work due to the absence of popcnt in simde + // xpu16 tmp = _data, zero = simde_mm256_setzero_si256(); + // xpu16 x = (tmp != zero); + // return simde_mm256_popcnt_epi16(x); } -inline std::vector BMat16::rows() const { // To change +inline std::vector BMat16::rows() const { std::vector rows; for (size_t i = 0; i < 16; ++i) { - uint16_t row = static_cast(_data[i/4] << (16 * (i%4)) >> 48); + uint16_t row_rev = (_data[i/4] << (16 * (3 - i%4)) >> 48); + uint16_t row = 0; + for (size_t j = 0; j < 16; ++j) { + row = (row << 1) | (row_rev & 1); + row_rev >>= 1; + } rows.push_back(row); } return rows; diff --git a/include/hpcombi/bmat8.hpp b/include/hpcombi/bmat8.hpp index 1437da3..f4132ff 100644 --- a/include/hpcombi/bmat8.hpp +++ b/include/hpcombi/bmat8.hpp @@ -133,7 +133,7 @@ class BMat8 { //! //! This method sets the (\p i, \p j)th entry of \c this to \p val. //! Uses the bit twiddle for setting bits found - //! here. + //! here. void set(size_t i, size_t j, bool val) noexcept; //! Returns the integer representation of \c this.