From 9de3055c46907a9f00d8321451885c8fc40e0e41 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Rouquier
Date: Tue, 17 Dec 2024 12:07:09 +0100
Subject: [PATCH] proofread the pull request

---
 README.md                   |  7 +++----
 include/hpcombi/epu8.hpp    |  6 +++---
 include/hpcombi/hpcombi.hpp | 27 ++++++++++++++++-----------
 include/hpcombi/perm16.hpp  | 25 ++++++++++++++-----------
 4 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 2a9dfb7..ed2333d 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,9 @@
 High Performance Combinatorics in C++ using vector instructions v1.0.1
 
 HPCombi is a C++17 header-only library using the SSE and AVX instruction sets,
-and some equivalents, for very fast manipulation of small combinatorial objects such
-as transformations, permutations, and boolean matrices. The goal
-of this project is to implement various new algorithms and benchmark them on
-various compiler and architectures.
+and some equivalents, for very fast manipulation of small combinatorial objects
+such as transformations, permutations, and boolean matrices. HPCombi implements
+new algorithms and benchmarks them on various compilers and architectures.
 
 HPCombi was initially designed using the SSE and AVX instruction sets, and did
 not work on machines without these instructions (such as ARM). From v1.0.1
diff --git a/include/hpcombi/epu8.hpp b/include/hpcombi/epu8.hpp
index 8c47b5e..c5f9b3b 100644
--- a/include/hpcombi/epu8.hpp
+++ b/include/hpcombi/epu8.hpp
@@ -52,9 +52,9 @@
 epu8 stands for *Extended Packed Unsigned, grouped by 8 bits*;
 this is the low level type chosen by Intel for their API to intrinsics, ie a SIMD vector of 16 unsigned bytes (16×8 = 128bits).
 Functions using this type use semantically equivalent types,
-eg a _m128 which is 2 vect of 64bits.
-a flag tells the compiler to silently consider those types equivalent.
- */
+e.g. a __m128i, which is a vector containing 2 signed 64-bit integers.
+A flag tells the compiler to silently consider those types equivalent.
+*/
 
 using epu8 = uint8_t __attribute__((vector_size(16)));
 static_assert(alignof(epu8) == 16,
diff --git a/include/hpcombi/hpcombi.hpp b/include/hpcombi/hpcombi.hpp
index c4cacad..b84f6fd 100644
--- a/include/hpcombi/hpcombi.hpp
+++ b/include/hpcombi/hpcombi.hpp
@@ -53,9 +53,9 @@
 applying a permutation on a vector only takes a few CPU cycles.
 Further ideas are:
 - Vectorization (MMX, SSE, AVX instructions sets) and careful memory alignment,
-- Careful memory management: avoiding all dynamic allocation during the computation,
-- Avoid all unnecessary copies (often needed to rewrite the containers),
-- Due to combinatorial explosion, sets often don’t fit in the computer’s memory or disks and are enumerated on the fly.
+- Careful memory management: avoid all dynamic allocation during the computation,
+- Avoid all unnecessary copies (this often requires rewriting the containers),
+- Due to combinatorial explosion, sets often don’t fit in memory or on disk and are enumerated on the fly.
 
 Here are some examples, the speedup is in comparison to an implementation
 without vector instructions:
@@ -72,24 +72,29 @@
 Cycle type of a permutation | 8.94
 
 \section sec_tips Tips to the user
-Note that memory access can become a problem. It you store many things, most of the time will be spent in fetching from RAM, not computing.
-Data structure should preserve locality. You might want to compute some stats on data structure usage and write custom ones.
+Note that memory access can become a problem.
+If your algorithm stores many things, most of the time will be spent fetching from RAM, not computing.
+The data structures your client code uses should preserve locality.
+You might want to compute some stats on data structure usage
+(e.g. average bucket sizes, list lengths, object lifetimes, etc.)
+and write custom data structures optimized for your usage profile.
 
 This lib is implemented with speed in mind, not code safety. Eg. there are no checks
 when building a permutation, which could be invalid (like non injective).
 
-We now suggest to have a look, in the menus above, at Classes → [Class list](annotated.html),
-esp. at classes are HPCombi::Perm16 and HPCombi::BMat8.
+We suggest having a look, in the menus above, at Classes → [Class list](annotated.html),
+esp. at the classes HPCombi::Perm16 and HPCombi::BMat8.
 
 \section Parallelism
 There is no parallelisation here.
 To use parallelism with this lib, see for instance:
 - Florent Hivert, High Performance Computing Experiments in Enumerative and Algebraic Combinatorics
-([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf), [DOI](https://dx.doi.org/10.1145/3115936.3115938)).
+([pdf](https://plouffe.fr/OEIS/citations/3115936.3115938.pdf),
+[DOI](https://dx.doi.org/10.1145/3115936.3115938)).
 - [OpenCilk](https://github.com/OpenCilk/) or look for another work stealing framework.
 Cilk is based on C++ and essentially adds the keywords `spawn` and `sync` to ease parallelism.
-Intel decided not to maintain it anymore so its deprecated.
-OpencilK is an open source project to continue it.
+Intel decided not to maintain Cilk anymore, so it is deprecated.
+[OpenCilk](https://github.com/OpenCilk/) is an open source project to continue it.
 
 We tested OpenMP and it was 2 orders of magnitude slower.
 
@@ -103,4 +108,4 @@
 much faster than launching true threads
 OpencilK provides some primitives for concurrent access to data.
 It guarantees the semantics of serial execution.
-*/
\ No newline at end of file
+*/
diff --git a/include/hpcombi/perm16.hpp b/include/hpcombi/perm16.hpp
index d620749..b4e0959 100644
--- a/include/hpcombi/perm16.hpp
+++ b/include/hpcombi/perm16.hpp
@@ -154,7 +154,7 @@ struct Transf16 : public PTransf16 {
     explicit operator uint64_t() const;
 };
 
-/** Partial permutation of @f$\{0\dots 15\}@f$; see HPCombi::Perm16;
+/** Partial permutation of @f$\{0\dots 15\}@f$; see also HPCombi::Perm16;
 partial means it might not be defined everywhere (but where it's defined,
 it's injective). Undefined images are encoded as 0xFF. */
 struct PPerm16 : public PTransf16 {
@@ -255,7 +255,6 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      * Frontend method: currently aliased to #inverse_cycl */
     Perm16 inverse() const { return inverse_cycl(); }
 
-
     /** Same as \ref HPCombi::Perm16::inverse "inverse" but with a different algorithm.
      * @par Algorithm:
      * Reference @f$O(n)@f$ algorithm using loop and indexed access
@@ -308,7 +307,6 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     static Perm16 unrankSJT(int n, int r);
 
-
     /**
      * @brief The Lehmer code of a permutation
      * @details
@@ -337,7 +335,7 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     epu8 lehmer_arr() const;
 
-    /** 
+    /**
      * @brief The Coxeter length (ie: number of inversion) of a permutation
      * @details
      * @returns the number of inversions of \c *this
@@ -428,7 +426,6 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     uint8_t nb_cycles_unroll() const;
 
-
     /**
      * @brief Compare two permutations for the left weak order
      * @par Example:
@@ -442,13 +439,15 @@ struct Perm16 : public Transf16 /* public PPerm : diamond problem */ {
      */
     bool left_weak_leq(Perm16 other) const;
 
-    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq"
+     * but with a different implementation.
      * @par Algorithm:
      * Reference @f$O(n^2)@f$ testing inclusion of inversions one by one
      */
     bool left_weak_leq_ref(Perm16 other) const;
 
-    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq" but with a different implementation.
+    /** Same interface as \ref HPCombi::Perm16::left_weak_leq "left_weak_leq"
+     * but with a different implementation.
      * @par Algorithm:
      * Reference @f$O(n)@f$ with vectorized test of inclusion
      */
@@ -471,7 +470,8 @@ static_assert(std::is_trivial<Perm16>(), "Perm16 is not a trivial class !");
 
 namespace std {
 // Hash operators for Transf and Perm:
-//! This type appears in the doc because we provide a hash function for HPCombi::PTransf16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PTransf16.
 template <> struct hash<HPCombi::PTransf16> {
     //! A hash operator for #HPCombi::PTransf16
     size_t operator()(const HPCombi::PTransf16 &ar) const {
@@ -479,7 +479,8 @@ template <> struct hash<HPCombi::PTransf16> {
     }
 };
 
-//! This type appears in the doc because we provide a hash function for HPCombi::Transf16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Transf16.
 template <> struct hash<HPCombi::Transf16> {
     //! A hash operator for #HPCombi::Transf16
     size_t operator()(const HPCombi::Transf16 &ar) const {
@@ -487,7 +488,8 @@ template <> struct hash<HPCombi::Transf16> {
     }
 };
 
-//! This type appears in the doc because we provide a hash function for HPCombi::PPerm16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::PPerm16.
 template <> struct hash<HPCombi::PPerm16> {
     //! A hash operator for #HPCombi::PPerm16
     size_t operator()(const HPCombi::PPerm16 &ar) const {
@@ -495,7 +497,8 @@ template <> struct hash<HPCombi::PPerm16> {
     }
 };
 
-//! This type appears in the doc because we provide a hash function for HPCombi::Perm16.
+//! This type appears in the doc because we provide a hash function for
+//! HPCombi::Perm16.
 template <> struct hash<HPCombi::Perm16> {
     //! A hash operator for #HPCombi::Perm16
     size_t operator()(const HPCombi::Perm16 &ar) const { return uint64_t(ar); }
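
Illustration (not part of the patch): the `epu8` typedef shown in the epu8.hpp hunk relies on the GCC/Clang vector extensions. A minimal, self-contained sketch of what such a type provides: brace initialisation, per-lane access, and element-wise arithmetic that the compiler lowers to SIMD instructions.

```cpp
#include <cstdint>
#include <cstdio>

// Same typedef as in epu8.hpp: a SIMD vector of 16 unsigned bytes (128 bits).
using epu8 = uint8_t __attribute__((vector_size(16)));
static_assert(alignof(epu8) == 16, "epu8 must be 16-byte aligned");

int main() {
    epu8 a = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    epu8 b = a + a;  // element-wise addition, compiled to a single vector add
    // Individual lanes are accessible with operator[].
    std::printf("b[1] = %d, b[15] = %d\n", int(b[1]), int(b[15]));  // prints 2 and 30
    return 0;
}
```

This requires GCC or Clang; MSVC does not support `__attribute__((vector_size))`.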
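
Illustration (not part of the patch): the Parallelism notes in the hpcombi.hpp hunk say that Cilk "essentially adds the keywords `spawn` and `sync`". Under the OpenCilk toolchain referenced there, the keywords are spelled `cilk_spawn` and `cilk_sync` and the code is compiled with `clang++ -fopencilk`; the classic recursive example is sketched below (it is not taken from HPCombi).

```cpp
#include <cilk/cilk.h>
#include <cstdio>

// Naive Fibonacci: the two recursive calls may run in parallel.
long fib(int n) {
    if (n < 2)
        return n;
    long a = cilk_spawn fib(n - 1);  // spawned: may execute on another worker
    long b = fib(n - 2);             // continues on the current worker
    cilk_sync;                       // wait for the spawned call to finish
    return a + b;
}

int main() {
    std::printf("fib(30) = %ld\n", fib(30));
    return 0;
}
```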
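
Illustration (not part of the patch): the perm16.hpp hunk documents `std::hash` specialisations for the HPCombi types. A minimal usage sketch follows; `HPCombi::Perm16::one()` and the include path are assumptions to check against the library, as they are not shown in the patch.

```cpp
#include <cstdio>
#include <functional>

#include "hpcombi/perm16.hpp"  // assumed include path

int main() {
    // Assumption: Perm16::one() returns the identity permutation.
    HPCombi::Perm16 p = HPCombi::Perm16::one();

    // Calls the std::hash<HPCombi::Perm16> specialisation shown in the patch,
    // which simply converts the permutation to a 64-bit integer.
    std::size_t h = std::hash<HPCombi::Perm16>{}(p);
    std::printf("hash(identity) = %zu\n", h);
    return 0;
}
```

Together with the corresponding equality operator, these specialisations are what allow the HPCombi types to be used as keys of `std::unordered_set` or `std::unordered_map`.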