From 33ca3fe0413587ad6076fbf9cbc445ed79ed6f55 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier <jrouquie@example.com>
Date: Mon, 16 Dec 2024 16:35:29 +0100
Subject: [PATCH] doc: misc

---
 TODO.txt                         |  3 ++-
 include/hpcombi/epu8.hpp         | 11 ++++++++---
 include/hpcombi/hpcombi.hpp      | 28 +++++++++++++++++++++++-----
 include/hpcombi/perm_generic.hpp |  2 +-
 4 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/TODO.txt b/TODO.txt
index 229d50df..9109b089 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -2,5 +2,6 @@
 - https://cmake.org/Wiki/CMake:How_To_Write_Platform_Checks
 - https://stackoverflow.com/questions/11944060/how-to-detect-target-architecture-using-cmake
 
-
 Add method data in perm16 and perm_generic
+
+Document examples. Eg for each file foo.cpp in examples/, add "@example foo.cpp" in a relevant file of include/hpcombi/.
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index 7b616227..8c47b5ef 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -79,22 +79,27 @@ inline bool equal(epu8 a, epu8 b) noexcept {
 /** Non equality of #HPCombi::epu8 */
 inline bool not_equal(epu8 a, epu8 b) noexcept { return !equal(a, b); }
 
-/** Permuting a #HPCombi::epu8 */
+/** Apply a permutation \c b on the vector \c a: for i=0..15 {result[i] = a[b[i]]} */
 inline epu8 permuted_ref(epu8 a, epu8 b) noexcept;
-/** Permuting a #HPCombi::epu8 */
+
+/** Same as \ref HPCombi::permuted_ref "permuted_ref"
+but with an optimized implementation using intrinsics. */
 inline epu8 permuted(epu8 a, epu8 b) noexcept {
     return simde_mm_shuffle_epi8(a, b);
 }
+
 /** Left shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
 inline epu8 shifted_right(epu8 a) noexcept {
     return simde_mm_bslli_si128(a, 1);
 }
+
 /** Right shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
 inline epu8 shifted_left(epu8 a) noexcept { return simde_mm_bsrli_si128(a, 1); }
+
 /** Reverting a #HPCombi::epu8 */
 inline epu8 reverted(epu8 a) noexcept { return permuted(a, Epu8.rev()); }
 
@@ -161,7 +166,7 @@ inline epu8 permutation_of_cmpestrm(epu8 a, epu8 b) noexcept;
 inline epu8 permutation_of_ref(epu8 a, epu8 b) noexcept;
 
 /**
- * @brief Find if a vector is a permutation of one other
+ * @brief Find if a vector is a permutation of another one
  * @details
  * @param a, b: two #HPCombi::epu8
  * @returns a #HPCombi::epu8
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index fcb094a4..c4cacad5 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -72,11 +72,6 @@ Cycle type of a permutation | 8.94
 
 \section sec_tips Tips to the user
 
-There is no parallelisation here. To use parallelism with this lib, see for instance:
-- Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
-([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf), [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
-- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
-
 Note that memory access can become a problem. It you store many things, most of the time will be spent in fetching from RAM, not computing.
 Data structure should preserve locality. You might want to compute some stats on data structure usage and write custom ones.
 
@@ -85,4 +80,27 @@ Eg. there are no checks when building a permutation, which could be invalid (lik
 
 We now suggest to have a look, in the menus above, at Classes → [Class list](annotated.html),
 esp. at classes are HPCombi::Perm16 and HPCombi::BMat8.
+
+\section sec_parallelism Parallelism
+There is no parallelisation here. To use parallelism with this lib, see for instance:
+- Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
+([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf), [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
+- [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
+
+Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to ease parallelism.
+Intel decided not to maintain it anymore, so it is deprecated.
+OpenCilk is an open source project to continue it.
+
+We tested OpenMP and it was 2 orders of magnitude slower.
+
+OpenCilk adds the keyword `spawn`,
+which adds a special tag to the stack and launches a recursive call.
+If a thread finishes its work, it will look at other threads' stacks and steal their work.
+The value of Cilk is that a spawned recursive call costs only 4 or 5 times more than a plain function call,
+much faster than launching true threads
+(which would take 6-7 orders of magnitude more time to create, measured in μs).
+
+OpenCilk provides some primitives for concurrent access to data.
+It guarantees the semantics of serial execution.
+
 */
\ No newline at end of file
diff --git a/include/hpcombi/perm_generic.hpp b/include/hpcombi/perm_generic.hpp
index 9335a3f9..db441266 100644
--- a/include/hpcombi/perm_generic.hpp
+++ b/include/hpcombi/perm_generic.hpp
@@ -40,7 +40,7 @@ namespace HPCombi {
 
 /** Vanilla (ie NOT optimized) implementation of a permutation, used to check for test correctness and as baseline to measure speedup.
 Implemented as an std array, so the permutation is not necessarily of size n=16.
-PermGeneric<16> should implment as much as possibles of Perm16 (currently not everything due to lack of time/need).
+PermGeneric<16> should implement as much as possible of Perm16 (currently not everything due to lack of time/need).
 No optimisation, so prefer to use Perm16.
 
 About Expo, see comment on HPCombi::VectGeneric.