diff --git a/include/hpcombi/bmat8_impl.hpp b/include/hpcombi/bmat8_impl.hpp
index 6d17585..35dd9fa 100644
--- a/include/hpcombi/bmat8_impl.hpp
+++ b/include/hpcombi/bmat8_impl.hpp
@@ -20,7 +20,8 @@
 //****************************************************************************//
 
 /** @file
-@brief implementation of bmat8.hpp ; this file should not be included directly. */
+@brief implementation of bmat8.hpp ; this file should not be included directly.
+*/
 
 // NOLINT(build/header_guard)
 
@@ -36,71 +37,70 @@ static const constexpr std::array<uint64_t, 8> COL_MASK = {
     0x1010101010101010, 0x808080808080808,  0x404040404040404,
     0x202020202020202,  0x101010101010101};
 
-static const constexpr std::array<uint64_t, 64> BIT_MASK = {{
-    0x8000000000000000,
-    0x4000000000000000,
-    0x2000000000000000,
-    0x1000000000000000,
-    0x800000000000000,
-    0x400000000000000,
-    0x200000000000000,
-    0x100000000000000,
-    0x80000000000000,
-    0x40000000000000,
-    0x20000000000000,
-    0x10000000000000,
-    0x8000000000000,
-    0x4000000000000,
-    0x2000000000000,
-    0x1000000000000,
-    0x800000000000,
-    0x400000000000,
-    0x200000000000,
-    0x100000000000,
-    0x80000000000,
-    0x40000000000,
-    0x20000000000,
-    0x10000000000,
-    0x8000000000,
-    0x4000000000,
-    0x2000000000,
-    0x1000000000,
-    0x800000000,
-    0x400000000,
-    0x200000000,
-    0x100000000,
-    0x80000000,
-    0x40000000,
-    0x20000000,
-    0x10000000,
-    0x8000000,
-    0x4000000,
-    0x2000000,
-    0x1000000,
-    0x800000,
-    0x400000,
-    0x200000,
-    0x100000,
-    0x80000,
-    0x40000,
-    0x20000,
-    0x10000,
-    0x8000,
-    0x4000,
-    0x2000,
-    0x1000,
-    0x800,
-    0x400,
-    0x200,
-    0x100,
-    0x80,
-    0x40,
-    0x20,
-    0x10,
-    0x8,
-    0x4,
-    0x2,
-    0x1}};
+static const constexpr std::array<uint64_t, 64> BIT_MASK = {{0x8000000000000000,
+                                                             0x4000000000000000,
+                                                             0x2000000000000000,
+                                                             0x1000000000000000,
+                                                             0x800000000000000,
+                                                             0x400000000000000,
+                                                             0x200000000000000,
+                                                             0x100000000000000,
+                                                             0x80000000000000,
+                                                             0x40000000000000,
+                                                             0x20000000000000,
+                                                             0x10000000000000,
+                                                             0x8000000000000,
+                                                             0x4000000000000,
+                                                             0x2000000000000,
+                                                             0x1000000000000,
+                                                             0x800000000000,
+                                                             0x400000000000,
+                                                             0x200000000000,
+                                                             0x100000000000,
+                                                             0x80000000000,
+                                                             0x40000000000,
+                                                             0x20000000000,
+                                                             0x10000000000,
+                                                             0x8000000000,
+                                                             0x4000000000,
+                                                             0x2000000000,
+                                                             0x1000000000,
+                                                             0x800000000,
+                                                             0x400000000,
+                                                             0x200000000,
+                                                             0x100000000,
+                                                             0x80000000,
+                                                             0x40000000,
+                                                             0x20000000,
+                                                             0x10000000,
+                                                             0x8000000,
+                                                             0x4000000,
+                                                             0x2000000,
+                                                             0x1000000,
+                                                             0x800000,
+                                                             0x400000,
+                                                             0x200000,
+                                                             0x100000,
+                                                             0x80000,
+                                                             0x40000,
+                                                             0x20000,
+                                                             0x10000,
+                                                             0x8000,
+                                                             0x4000,
+                                                             0x2000,
+                                                             0x1000,
+                                                             0x800,
+                                                             0x400,
+                                                             0x200,
+                                                             0x100,
+                                                             0x80,
+                                                             0x40,
+                                                             0x20,
+                                                             0x10,
+                                                             0x8,
+                                                             0x4,
+                                                             0x2,
+                                                             0x1}};
 
 inline bool BMat8::operator()(size_t i, size_t j) const noexcept {
     HPCOMBI_ASSERT(i < 8);
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index c5f9b3b..2d8af73 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -79,7 +79,8 @@ inline bool equal(epu8 a, epu8 b) noexcept {
 /** Non equality of #HPCombi::epu8 */
 inline bool not_equal(epu8 a, epu8 b) noexcept { return !equal(a, b); }
 
-/** Apply a permutation \c b on the vector \c a: for i=0..16 {result[i] = a[b[i]} */
+/** Apply a permutation \c b on the vector \c a: for i=0..16 {result[i] =
+ * a[b[i]]} */
 inline epu8 permuted_ref(epu8 a, epu8 b) noexcept;
 
 /** Same as \ref HPCombi::permuted_ref "permuted_ref"
@@ -154,13 +155,15 @@ inline epu8 sort8_perm(epu8 &a) noexcept;
 inline void merge(epu8 &a, epu8 &b) noexcept;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a different implementation.
+/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a
+   different implementation.
     @par Algorithm: uses string matching cpmestrm intrinsics
  */
 inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept;
 #endif
 
-/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a different implementation.
+/** Same interface as \ref HPCombi::permutation_of "permutation_of" but with a
+   different implementation.
     @par Algorithm: reference implementation
  */
 inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept;
@@ -197,14 +200,16 @@ inline epu8 random_epu8(uint16_t bnd);
  */
 inline epu8 remove_dups(epu8 a, uint8_t repl = 0) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 
 inline uint8_t horiz_sum_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
@@ -212,13 +217,15 @@ inline uint8_t horiz_sum_ref(epu8) noexcept;
 
 inline uint8_t horiz_sum_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_sum4(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_sum "horiz_sum" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
@@ -237,20 +244,23 @@ inline uint8_t horiz_sum3(epu8) noexcept;
  */
 inline uint8_t horiz_sum(epu8 v) noexcept { return horiz_sum3(v); }
 
-/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_sums_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_sums_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_sums "partial_sums" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
@@ -268,26 +278,30 @@ inline epu8 partial_sums_round(epu8) noexcept;
  */
 inline epu8 partial_sums(epu8 v) noexcept { return partial_sums_round(v); }
 
-/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint8_t horiz_max_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline uint8_t horiz_max_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_max4(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_max "horiz_max" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
@@ -305,20 +319,23 @@ inline uint8_t horiz_max3(epu8) noexcept;
  */
 inline uint8_t horiz_max(epu8 v) noexcept { return horiz_max4(v); }
 
-/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_max_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_max_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_max "partial_max" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_max "partial_max" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
@@ -336,26 +353,30 @@ inline epu8 partial_max_round(epu8) noexcept;
  */
 inline epu8 partial_max(epu8 v) noexcept { return partial_max_round(v); }
 
-/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint8_t horiz_min_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline uint8_t horiz_min_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
 inline uint8_t horiz_min4(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different implementation.
+/** Same interface as \ref HPCombi::horiz_min "horiz_min" but with a different
+ * implementation.
  *  @par Algorithm:
  *  3-stages parallel algorithm + indexed access
  */
@@ -370,23 +391,26 @@ inline uint8_t horiz_min3(epu8) noexcept;
  * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4});
  * @endcode
  * Returns `1`
-*/
+ */
 inline uint8_t horiz_min(epu8 v) noexcept { return horiz_min4(v); }
 
-/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 partial_min_ref(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  *  through #HPCombi::VectGeneric
  */
 inline epu8 partial_min_gen(epu8) noexcept;
 
-/** Same interface as \ref HPCombi::partial_min "partial_min" but with a different implementation.
+/** Same interface as \ref HPCombi::partial_min "partial_min" but with a
+ * different implementation.
  *  @par Algorithm:
  *  4-stages parallel algorithm
  */
@@ -404,32 +428,37 @@ inline epu8 partial_min_round(epu8) noexcept;
  */
 inline epu8 partial_min(epu8 v) noexcept { return partial_min_round(v); }
 
-/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline epu8 eval16_ref(epu8 v) noexcept;
 
-/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and cast to array
  */
 inline epu8 eval16_arr(epu8 v) noexcept;
 
-/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Vector @f$O(n)@f$ using cyclic shifting
  */
 inline epu8 eval16_cycle(epu8 v) noexcept;
 
-/** Same interface as \ref HPCombi::eval16 "eval16" but with a different implementation.
+/** Same interface as \ref HPCombi::eval16 "eval16" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Vector @f$O(n)@f$ using popcount
  */
 inline epu8 eval16_popcount(epu8 v) noexcept;
 
 /**
- * @brief Evaluation of a #HPCombi::epu8: count how many times each int of 0..15 appears in the input.
+ * @brief Evaluation of a #HPCombi::epu8: count how many times each int of 0..15
+ * appears in the input.
  * @details
  * @param v : a #HPCombi::epu8
  * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
@@ -443,21 +472,24 @@ inline epu8 eval16_popcount(epu8 v) noexcept;
  */
 inline epu8 eval16(epu8 v) noexcept { return eval16_cycle(v); }
 
-/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using \c cmpestri instruction
  */
 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
 #endif
 
-/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::first_diff "first_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using vector comparison and mask
  */
@@ -485,21 +517,24 @@ inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) noexcept {
     return first_diff_mask(a, b, bound);
 }
 
-/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16) noexcept;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using \c cmpestri instruction
  */
 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16) noexcept;
 #endif
 
-/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different implementation.
+/** Same interface as \ref HPCombi::last_diff "last_diff" but with a different
+ * implementation.
  *  @par Algorithm:
  *  Using vector comparison and mask
  */
@@ -610,17 +645,20 @@ inline bool is_transformation(epu8 v, const size_t k = 16) noexcept;
 inline bool is_partial_permutation(epu8 v, const size_t k = 16) noexcept;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
     @par Algorithm: uses string matching cpmestri intrinsics
  */
 inline bool is_permutation_cpmestri(epu8 v, const size_t k = 16) noexcept;
 #endif
-/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
     @par Algorithm: sort the vector and compare to identity
  */
 inline bool is_permutation_sort(epu8 v, const size_t k = 16) noexcept;
 
-/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a different implementation.
+/** Same interface as \ref HPCombi::is_permutation "is_permutation" but with a
+   different implementation.
     @par Algorithm: uses evaluation
  */
 inline bool is_permutation_eval(epu8 v, const size_t k = 16) noexcept;
diff --git a/include/hpcombi/epu8_impl.hpp b/include/hpcombi/epu8_impl.hpp
index 2cc6c87..702372c 100644
--- a/include/hpcombi/epu8_impl.hpp
+++ b/include/hpcombi/epu8_impl.hpp
@@ -20,7 +20,8 @@
 // NOLINT(build/header_guard)
 
 /** @file
-@brief implementation of epu8.hpp ; this file should not be included directly. */
+@brief implementation of epu8.hpp ; this file should not be included directly.
+*/
 
 #include <initializer_list>
 #include <iostream>
@@ -553,7 +554,8 @@ inline std::string to_string(HPCombi::epu8 const &a) {
     return ss.str();
 }
 
-//! This type appears in the doc because we provide an equal operator for HPCombi::epu8.
+//! This type appears in the doc because we provide an equal operator for
+//! HPCombi::epu8.
 template <> struct equal_to<HPCombi::epu8> {
     bool operator()(const HPCombi::epu8 &lhs,
                     const HPCombi::epu8 &rhs) const noexcept {
@@ -561,7 +563,8 @@ template <> struct equal_to<HPCombi::epu8> {
     }
 };
 
-//! This type appears in the doc because we provide a not_equal operator for HPCombi::epu8.
+//! This type appears in the doc because we provide a not_equal operator for
+//! HPCombi::epu8.
 template <> struct not_equal_to<HPCombi::epu8> {
     bool operator()(const HPCombi::epu8 &lhs,
                     const HPCombi::epu8 &rhs) const noexcept {
@@ -569,7 +572,8 @@ template <> struct not_equal_to<HPCombi::epu8> {
     }
 };
 
-//! This type appears in the doc because we provide a hash function for HPCombi::epu8.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::epu8.
 template <> struct hash<HPCombi::epu8> {
     inline size_t operator()(HPCombi::epu8 a) const noexcept {
         unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
@@ -586,7 +590,8 @@ template <> struct hash<HPCombi::epu8> {
     }
 };
 
-//! This type appears in the doc because we provide a less operator for HPCombi::epu8.
+//! This type appears in the doc because we provide a less operator for
+//! HPCombi::epu8.
 template <> struct less<HPCombi::epu8> {
     // WARNING: due to endianness this is not lexicographic comparison,
     //          but we don't care when using in std::set.
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index b84f6fd..af9282c 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -39,23 +39,27 @@ and also debug.hpp, epu8.hpp, etc.*/
 
 \section readme_sec Readme
 
-You might want to have a look at [the Readme in the sources](https://github.com/libsemigroups/HPCombi/blob/main/README.md).
+You might want to have a look at [the Readme in the
+sources](https://github.com/libsemigroups/HPCombi/blob/main/README.md).
 
 \section sec_philo Philosophy
-This library provides high performance computations in combinatorics (hence its name).
-In practice we observe large speedups in several enumeration problems.
+This library provides high performance computations in combinatorics (hence its
+name). In practice we observe large speedups in several enumeration problems.
 
-The main idea of the library is a way to encode data as a small sequence of small integers,
-that can be handled efficiently by a creative use of vector instructions.
-For example, on the current x86 machines, small permutations (N ≤ 16) are very well handled.
-Indeed thanks to machine instructions such as PSHUFB (Packed SHUFfle Bytes),
-applying a permutation on a vector only takes a few CPU cycles.
+The main idea of the library is a way to encode data as a small sequence of
+small integers that can be handled efficiently by a creative use of vector
+instructions. For example, on current x86 machines, small permutations
+(N ≤ 16) are very well handled. Indeed, thanks to machine instructions such as
+PSHUFB (Packed SHUFfle Bytes), applying a permutation on a vector only takes a
+few CPU cycles.
 
 Further ideas are:
 - Vectorization (MMX, SSE, AVX instructions sets) and careful memory alignment,
-- Careful memory management: avoid all dynamic allocation during the computation,
+- Careful memory management: avoid all dynamic allocation during the
+computation,
 - Avoid all unnecessary copies (it is often needed to rewrite the containers),
-- Due to combinatorial explosion, sets often don’t fit in memory or disk and are enumerated on the fly.
+- Due to combinatorial explosion, sets often don’t fit in memory or disk and are
+enumerated on the fly.
 
 Here are some examples,
 the speedup is in comparison to an implementation without vector instructions:
@@ -73,37 +77,42 @@ Cycle type of a permutation | 8.94
 \section sec_tips Tips to the user
 
 Note that memory access can become a problem.
-If your algorithm stores many things, most of the time will be spent in fetching from RAM, not computing.
-The data structures your client code uses should preserve locality.
-You might want to compute some stats on data structure usage
+If your algorithm stores many things, most of the time will be spent in fetching
+from RAM, not computing. The data structures your client code uses should
+preserve locality. You might want to compute some stats on data structure usage
 (eg avg size of buckets used, lengths of lists, lifetime of objects, etc.)
 and write custom data structure optimized for your usage profile.
 
 This lib is implemented with speed in mind, not code safety.
-Eg. there are no checks when building a permutation, which could be invalid (like non injective).
+E.g., there are no checks when building a permutation, which could be invalid
+(for instance, non-injective).
 
-We suggest having a look, in the menus above, at Classes → [Class list](annotated.html),
-esp. at the classes HPCombi::Perm16 and HPCombi::BMat8.
+We suggest having a look, in the menus above, at Classes → [Class
+list](annotated.html), esp. at the classes HPCombi::Perm16 and HPCombi::BMat8.
 
 \section Parallelism
-There is no parallelisation here. To use parallelism with this lib, see for instance:
-- Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
+There is no parallelisation here. To use parallelism with this lib, see for
+instance:
+- Florent Hivert, High Performance Computing Experiments in Enumerative and
+Algebraic Combinatorics
 ([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf),
 [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
-- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
+- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing
+framework.
 
-Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to ease parallelism.
-Intel decided not to maintain Cilk anymore so it is deprecated.
-[OpencilK](https://github.com/OpenCilk/) is an open source project to continue it.
+Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to
+ease parallelism. Intel decided not to maintain Cilk anymore so it is
+deprecated. [OpenCilk](https://github.com/OpenCilk/) is an open source project
+that continues it.
 
 We tested OpenMP and it was 2 orders of magnitude slower.
 
 OpencilK adds the keyword `spawn`,
 which adds a special tag to the stack and launches a recursive call.
-If a thread finishes its work, it will look at other threads' stacks and steal their work.
-The value of Cilk is that recursive calls cost only 4 or 5 times more,
-much faster than launching true threads
-(which would take 6-7 orders of magnitude more time to create, measured in μs).
+If a thread finishes its work, it will look at other threads' stacks and steal
+their work. The value of Cilk is that recursive calls cost only 4 or 5 times
+more, much faster than launching true threads (which would take 6-7 orders of
+magnitude more time to create, measured in μs).
 
 OpencilK provides some primitives for concurrent access to data.
 It guarantees the semantics of serial execution.
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index b4e0959..05c4a36 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -155,8 +155,8 @@ struct Transf16 : public PTransf16 {
 };
 
 /** Partial permutation of @f$\{0\dots 15\}@f$; see also HPCombi::Perm16;
-partial means it might not be defined everywhere (but where it's defined, it's injective).
-Undefined images are encoded as 0xFF. */
+partial means it might not be defined everywhere (but where it's defined, it's
+injective). Undefined images are encoded as 0xFF. */
 struct PPerm16 : public PTransf16 {
     PPerm16() = default;
     constexpr PPerm16(const PPerm16 &v) = default;
@@ -200,7 +200,8 @@ struct PPerm16 : public PTransf16 {
     PPerm16 inverse_ref() const;
 
 #ifdef SIMDE_X86_SSE4_2_NATIVE
-    /** Same as \ref HPCombi::PPerm16::inverse_ref "inverse_ref" but with a different algorithm.
+    /** Same as \ref HPCombi::PPerm16::inverse_ref "inverse_ref" but with a
+     * different algorithm.
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
@@ -255,19 +256,22 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * Frontend method: currently aliased to #inverse_cycl */
     Perm16 inverse() const { return inverse_cycl(); }
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ algorithm using loop and indexed access
      */
     Perm16 inverse_ref() const;
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  @f$O(n)@f$ algorithm using reference cast to arrays
      */
     Perm16 inverse_arr() const;
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Insert the identity in the least significant bits and sort using a
      *  sorting network. The number of rounds of the optimal sorting network is
@@ -275,14 +279,16 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     Perm16 inverse_sort() const;
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  @f$O(\log n)@f$ algorithm using some kind of vectorized dichotomic
      * search.
      */
     Perm16 inverse_find() const { return permutation_of(v, one()); }
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *
      * Use HPCombi::pow to
@@ -291,7 +297,8 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     Perm16 inverse_pow() const;
 
-    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
+    /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different
+     * algorithm.
      *  @par Algorithm:
      *  Compute power from @f$n/2@f$ to @f$n@f$, when @f$\sigma^k(i)=i@f$ then
      *  @f$\sigma^{-1}(i)=\sigma^{k-1}(i)@f$. Complexity @f$O(n)@f$
@@ -323,13 +330,15 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     epu8 lehmer() const;
 
-    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a
+     * different implementation.
      * @par Algorithm:
      * Reference @f$O(n^2)@f$ algorithm using loop and indexed access
      */
     epu8 lehmer_ref() const;
 
-    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::lehmer "lehmer" but with a
+     * different implementation.
      * @par Algorithm:
      * Reference @f$O(n^2)@f$ algorithm using array, loop and indexed access
      */
@@ -350,13 +359,15 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     uint8_t length() const;
 
-    /** Same interface as \ref HPCombi::Perm16::length "length", with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::length "length", with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access
      */
     uint8_t length_ref() const;
 
-    /** Same interface as \ref HPCombi::Perm16::length "length", with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::length "length", with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n^2)@f$ algorithm using loop and indexed access after
      *     a cast to \c std::array
@@ -378,7 +389,8 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     uint8_t nb_descents() const;
 
-    /** Same interface as \ref HPCombi::Perm16::nb_descents "nb_descents", with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::nb_descents "nb_descents", with
+     * a different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ using a loop
      */
@@ -414,13 +426,15 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     uint8_t nb_cycles() const { return nb_cycles_unroll(); }
 
-    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(n)@f$ using a boolean vector
      */
     uint8_t nb_cycles_ref() const;
 
-    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::nb_cycles "nb_cycles" but with a
+     * different implementation.
      *  @par Algorithm:
      *  Reference @f$O(\log(n))@f$ using #cycles_partition
      */
diff --git a/include/hpcombi/perm16_impl.hpp b/include/hpcombi/perm16_impl.hpp
index 5c4d102..4d2daed 100644
--- a/include/hpcombi/perm16_impl.hpp
+++ b/include/hpcombi/perm16_impl.hpp
@@ -20,7 +20,8 @@
 // NOLINT(build/header_guard)
 
 /** @file
-@brief implementation of perm16.hpp ; this file should not be included directly. */
+@brief implementation of perm16.hpp ; this file should not be included directly.
+*/
 
 namespace HPCombi {
 inline PTransf16::PTransf16(std::initializer_list<uint8_t> il)
diff --git a/include/hpcombi/perm_generic.hpp b/include/hpcombi/perm_generic.hpp
index db44126..164cb71 100644
--- a/include/hpcombi/perm_generic.hpp
+++ b/include/hpcombi/perm_generic.hpp
@@ -38,10 +38,11 @@
 
 namespace HPCombi {
 
-/** Vanilla (ie NOT optimized) implementation of a permutation, used to check for test correctness and as baseline to measure speedup.
-Implemented as an std array, so the permutation is not necessarily of size n=16.
-PermGeneric<16> should implement as much as possibles of Perm16 (currently not everything due to lack of time/need).
-No optimisation, so prefer to use Perm16.
+/** Vanilla (i.e. NOT optimized) implementation of a permutation, used to check
+for test correctness and as baseline to measure speedup. Implemented as an std
+array, so the permutation is not necessarily of size n=16. PermGeneric<16>
+should implement as much as possible of Perm16 (currently not everything due to
+lack of time/need). No optimisation, so prefer to use Perm16.
 
 About Expo, see comment on HPCombi::VectGeneric.
 */
diff --git a/include/hpcombi/perm_generic_impl.hpp b/include/hpcombi/perm_generic_impl.hpp
index 72b5a45..744ac3c 100644
--- a/include/hpcombi/perm_generic_impl.hpp
+++ b/include/hpcombi/perm_generic_impl.hpp
@@ -119,7 +119,8 @@ bool PermGeneric<Size, Expo>::left_weak_leq(PermGeneric other) const {
 
 namespace std {
 
-//! This type appears in the doc because we provide a hash function for HPCombi::PermGeneric.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PermGeneric.
 template <size_t Size, typename Expo>
 struct hash<HPCombi::PermGeneric<Size, Expo>> {
     size_t operator()(const HPCombi::PermGeneric<Size, Expo> &ar) const {
diff --git a/include/hpcombi/power.hpp b/include/hpcombi/power.hpp
index a98db24..6afb7b2 100644
--- a/include/hpcombi/power.hpp
+++ b/include/hpcombi/power.hpp
@@ -17,13 +17,13 @@
 //  with HP-Combi. If not, see <https://www.gnu.org/licenses/>.               //
 //****************************************************************************//
 
-
 /** @file
 @brief  Generic compile-time unrolling of the fast exponentiation algorithm.
 
 Allows to write expressions such as
 - @c pow<23>(2.5) : entirely computed at compile time
-- @c pow<n>(x) expanded at compile time to a O(log n) long sequence of multiplications.
+- @c pow<n>(x) expanded at compile time to an O(log n) long sequence of
+multiplications.
 
 Such expressions work for numbers but also for any type where there is a
 neutral element and an associative (non necessarily commutative) product,
diff --git a/include/hpcombi/vect16.hpp b/include/hpcombi/vect16.hpp
index f9a0b2e..d0e13b3 100644
--- a/include/hpcombi/vect16.hpp
+++ b/include/hpcombi/vect16.hpp
@@ -34,7 +34,8 @@
 
 namespace HPCombi {
 
-/** Vector of 16 bytes, with some optimized methods, superclass of HPCombi::Transf16. */
+/** Vector of 16 bytes, with some optimized methods, superclass of
+ * HPCombi::Transf16. */
 struct alignas(16) Vect16 {
     static constexpr size_t size() { return 16; }
     using array = typename decltype(Epu8)::array;
@@ -117,7 +118,8 @@ inline std::ostream &operator<<(std::ostream &stream,
     return operator<<(stream, ar.v);
 }
 
-//! This type appears in the doc because we provide a hash function for HPCombi::Vect16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Vect16.
 template <> struct hash<HPCombi::Vect16> {
     size_t operator()(const HPCombi::Vect16 &ar) const {
         return std::hash<HPCombi::epu8>{}(ar.v);
diff --git a/include/hpcombi/vect_generic.hpp b/include/hpcombi/vect_generic.hpp
index 65cf4da..0927f18 100644
--- a/include/hpcombi/vect_generic.hpp
+++ b/include/hpcombi/vect_generic.hpp
@@ -47,8 +47,8 @@ std::array<Expo, Size> sorted_vect(std::array<Expo, Size> v) {
 }
 
 /** \ref HPCombi::VectGeneric "VectGeneric" is to \ref HPCombi::Vect16 "Vect16"
-what \ref HPCombi::PermGeneric "PermGeneric" is to \ref HPCombi::Perm16 "Perm16";
-see \ref HPCombi::PermGeneric "PermGeneric".
+what \ref HPCombi::PermGeneric "PermGeneric" is to \ref HPCombi::Perm16
+"Perm16"; see \ref HPCombi::PermGeneric "PermGeneric".
 
 HPCombi started as a library to manipulate monomials on several variables,
 ie a tuple of *expo*nents. The elements of arrays were thus named Expo.
@@ -244,7 +244,8 @@ std::ostream &operator<<(std::ostream &stream,
     return stream;
 }
 
-//! This type appears in the doc because we provide a hash function for HPCombi::VectGeneric.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::VectGeneric.
 template <size_t Size, typename Expo>
 struct hash<HPCombi::VectGeneric<Size, Expo>> {
     size_t operator()(const HPCombi::VectGeneric<Size, Expo> &ar) const {