From eb84cee8b886cf278e999973511f9382f408c0d3 Mon Sep 17 00:00:00 2001
From: Florent Hivert
Date: Mon, 23 Oct 2023 14:52:06 +0200
Subject: [PATCH] Switch to simde

---
 include/epu.hpp         | 26 ++++++++++------
 include/epu_impl.hpp    | 66 ++++++++++++++++++++++++-----------------
 include/perm16_impl.hpp | 40 ++++++++++++++-----------
 3 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/include/epu.hpp b/include/epu.hpp
index a3e13a88..87f807fe 100644
--- a/include/epu.hpp
+++ b/include/epu.hpp
@@ -22,7 +22,6 @@
 #include <functional>  // less<>, equal_to<>
 #include
 #include
-#include <x86intrin.h>
 
 #ifdef HPCOMBI_HAVE_CONFIG
 #include "HPCombi-config.h"
@@ -34,6 +33,11 @@
 
 #include "vect_generic.hpp"
 
+
+#include "simde/x86/sse4.1.h"
+#include "simde/x86/sse4.2.h"
+
+
 #ifdef HPCOMBI_CONSTEXPR_FUN_ARGS
 #define HPCOMBI_CONSTEXPR constexpr
 #define HPCOMBI_CONSTEXPR_CONSTRUCTOR constexpr
@@ -202,32 +206,32 @@ inline const VectGeneric<16> &as_VectGeneric(const epu8 &v) {
 }
 /** Test whether all the entries of a #HPCombi::epu8 are zero */
-inline bool is_all_zero(epu8 a) { return _mm_testz_si128(a, a); }
+inline bool is_all_zero(epu8 a) { return simde_mm_testz_si128(a, a); }
 /** Test whether all the entries of a #HPCombi::epu8 are one */
-inline bool is_all_one(epu8 a) { return _mm_testc_si128(a, Epu8(0xFF)); }
+inline bool is_all_one(epu8 a) { return simde_mm_testc_si128(a, Epu8(0xFF)); }
 /** Equality of #HPCombi::epu8 */
-inline bool equal(epu8 a, epu8 b) { return is_all_zero(_mm_xor_si128(a, b)); }
+inline bool equal(epu8 a, epu8 b) { return is_all_zero(simde_mm_xor_si128(a, b)); }
 /** Non equality of #HPCombi::epu8 */
 inline bool not_equal(epu8 a, epu8 b) { return not equal(a, b); }
 
 /** Permuting a #HPCombi::epu8 */
-inline epu8 permuted(epu8 a, epu8 b) { return _mm_shuffle_epi8(a, b); }
+inline epu8 permuted(epu8 a, epu8 b) { return simde_mm_shuffle_epi8(a, b); }
 
 /** Left shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
-inline epu8 shifted_right(epu8 a) { return _mm_bslli_si128(a, 1); }
+inline epu8 shifted_right(epu8 a) { return simde_mm_bslli_si128(a, 1); }
 
 /** Right shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
-inline epu8 shifted_left(epu8 a) { return _mm_bsrli_si128(a, 1); }
+inline epu8 shifted_left(epu8 a) { return simde_mm_bsrli_si128(a, 1); }
 
 /** Reverting a #HPCombi::epu8 */
 inline epu8 reverted(epu8 a) { return permuted(a, epu8rev); }
 
 /** Vector min between two #HPCombi::epu8 0 */
-inline epu8 min(epu8 a, epu8 b) { return _mm_min_epu8(a, b); }
+inline epu8 min(epu8 a, epu8 b) { return simde_mm_min_epu8(a, b); }
 /** Vector max between two #HPCombi::epu8 0 */
-inline epu8 max(epu8 a, epu8 b) { return _mm_max_epu8(a, b); }
+inline epu8 max(epu8 a, epu8 b) { return simde_mm_max_epu8(a, b); }
 /** Testing if a #HPCombi::epu8 is sorted */
 inline bool is_sorted(epu8 a);
@@ -546,11 +550,13 @@ inline epu8 eval16(epu8 v) { return eval16_cycle(v); };
  * Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16);
+#ifdef SIMDE_X86_SSE4_2_NATIVE
 /** @copydoc common_first_diff
  * @par Algorithm:
  * Using \c cmpestri instruction
  */
 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
+#endif
 /** @copydoc common_first_diff
  * @par Algorithm:
  * Using vector comparison and mask
@@ -584,11 +590,13 @@ inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) {
  * Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16);
+#ifdef SIMDE_X86_SSE4_2_NATIVE
 /** @copydoc common_last_diff
  * @par Algorithm:
  * Using \c cmpestri instruction
 */
 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
+#endif
 /** @copydoc common_last_diff
  * @par Algorithm:
 * Using vector comparison and mask
diff --git a/include/epu_impl.hpp b/include/epu_impl.hpp
index c085abff..43784b3d 100644
--- a/include/epu_impl.hpp
+++ b/include/epu_impl.hpp
@@ -24,18 +24,18 @@
 
 // Comparison mode for _mm_cmpestri
 #define FIRST_DIFF \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | SIMDE_SIDD_NEGATIVE_POLARITY)
 #define LAST_DIFF \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY | \
-     _SIDD_MOST_SIGNIFICANT)
-#define FIRST_ZERO (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | SIMDE_SIDD_NEGATIVE_POLARITY | \
+     SIMDE_SIDD_MOST_SIGNIFICANT)
+#define FIRST_ZERO (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY)
 #define LAST_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MOST_SIGNIFICANT)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MOST_SIGNIFICANT)
 #define FIRST_NON_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MASKED_NEGATIVE_POLARITY)
 #define LAST_NON_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY | \
-     _SIDD_MOST_SIGNIFICANT)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MASKED_NEGATIVE_POLARITY | \
+     SIMDE_SIDD_MOST_SIGNIFICANT)
 
 namespace HPCombi {
@@ -45,11 +45,11 @@
 
 // Msk is supposed to be a boolean mask (i.e. each entry is either 0 or 255)
 inline uint64_t first_mask(epu8 msk, size_t bound) {
-    uint64_t res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
+    uint64_t res = simde_mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
     return res == 0 ? 16 : _bit_scan_forward(res);
 }
 inline uint64_t last_mask(epu8 msk, size_t bound) {
-    auto res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
+    auto res = simde_mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
     return res == 0 ? 16 : _bit_scan_reverse(res);
 }
 
@@ -59,9 +59,11 @@ inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound) {
             return i;
     return 16;
 }
+#ifdef SIMDE_X86_SSE4_2_NATIVE
 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
     return unsigned(_mm_cmpestri(a, bound, b, bound, FIRST_DIFF));
 }
+#endif
 inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound) {
     return first_mask(a != b, bound);
 }
@@ -74,9 +76,11 @@ inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound) {
     }
     return 16;
 }
+#ifdef SIMDE_X86_SSE4_2_NATIVE
 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
     return unsigned(_mm_cmpestri(a, bound, b, bound, LAST_DIFF));
 }
+#endif
 inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound) {
     return last_mask(a != b, bound);
 }
@@ -114,7 +118,7 @@ inline epu8 network_sort(epu8 res, std::array rounds) {
         epu8 mask = Increassing ? round < epu8id : epu8id < round;
         epu8 b = permuted(res, round);
         // res = mask ? min(res,b) : max(res,b); is not accepted by clang
-        res = _mm_blendv_epi8(min(res, b), max(res, b), mask);
+        res = simde_mm_blendv_epi8(min(res, b), max(res, b), mask);
     }
     return res;
 }
@@ -127,9 +131,9 @@ inline epu8 network_sort_perm(epu8 &v, std::array rounds) {
         // This conditional should be optimized out by the compiler
         epu8 mask = Increassing ? round < epu8id : epu8id < round;
         epu8 b = permuted(v, round);
-        epu8 cmp = _mm_blendv_epi8(b < v, v < b, mask);
-        v = _mm_blendv_epi8(v, b, cmp);
-        res = _mm_blendv_epi8(res, permuted(res, round), cmp);
+        epu8 cmp = simde_mm_blendv_epi8(b < v, v < b, mask);
+        v = simde_mm_blendv_epi8(v, b, cmp);
+        res = simde_mm_blendv_epi8(res, permuted(res, round), cmp);
     }
     return res;
 }
@@ -178,7 +182,7 @@ constexpr std::array sorting_rounds8
 // clang-format on
 
 inline bool is_sorted(epu8 a) {
-    return _mm_movemask_epi8(shifted_right(a) > a) == 0;
+    return simde_mm_movemask_epi8(shifted_right(a) > a) == 0;
 }
 inline epu8 sorted(epu8 a) {
     return network_sort(a, sorting_rounds);
@@ -215,7 +219,7 @@ inline epu8 random_epu8(uint16_t bnd) {
 inline epu8 remove_dups(epu8 v, uint8_t repl) {
     // Vector ternary operator is not supported by clang.
     // return (v != shifted_right(v) ? v : Epu8(repl);
-    return _mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
+    return simde_mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
 }
 
 // Gather at the front numbers with (3-i)-th bit not set.
@@ -229,12 +233,13 @@ constexpr std::array inverting_rounds {{
 }};
 
 #define FIND_IN_VECT \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK | \
-     _SIDD_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK | \
+     SIMDE_SIDD_NEGATIVE_POLARITY)
 #define FIND_IN_VECT_COMPL \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK)
 
 inline epu8 permutation_of(epu8 a, epu8 b) {
+#ifdef SIMDE_X86_SSE4_2_NATIVE
     epu8 res = -static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
     for (epu8 round : inverting_rounds) {
         a = permuted(a, round);
@@ -242,6 +247,8 @@ inline epu8 permutation_of(epu8 a, epu8 b) {
         res -= static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
     }
     return res;
+#else
+#endif
 }
 
@@ -404,7 +411,7 @@ inline epu8 eval16_cycle(epu8 v) {
 inline epu8 eval16_popcount(epu8 v) {
     epu8 res{};
     for (size_t i = 0; i < 16; i++) {
-        res[i] = __builtin_popcountl(_mm_movemask_epi8(v == Epu8(uint8_t(i))));
+        res[i] = __builtin_popcountl(simde_mm_movemask_epi8(v == Epu8(uint8_t(i))));
     }
     return res;
 }
@@ -419,13 +426,13 @@ inline bool is_partial_transformation(epu8 v, const size_t k) {
     uint64_t diff = last_diff(v, epu8id, 16);
     // (forall x in v, x + 1 <= 16) and
     // (v = Perm16::one() or last diff index < 16)
-    return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
+    return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
         && (diff == 16 || diff < k);
 }
 
 inline bool is_transformation(epu8 v, const size_t k) {
     uint64_t diff = last_diff(v, epu8id, 16);
-    return (_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff)
+    return (simde_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff)
         && (diff == 16 || diff < k);
 }
@@ -434,8 +441,8 @@ inline bool is_partial_permutation(epu8 v, const size_t k) {
     // (forall x in v, x <= 15) and
     // (forall x < 15, multiplicity x v <= 1
     // (v = Perm16::one() or last diff index < 16)
-    return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
-        && (_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff)
+    return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
+        && (simde_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff)
         && (diff == 16 || diff < k);
 }
 
@@ -444,9 +451,12 @@ inline bool is_permutation(epu8 v, const size_t k) {
     // (forall x in v, x in Perm16::one()) and
     // (forall x in Perm16::one(), x in v) and
     // (v = Perm16::one() or last diff index < 16)
+#ifdef SIMDE_X86_SSE4_2_NATIVE
     return _mm_cmpestri(epu8id, 16, v, 16, FIRST_NON_ZERO) == 16 &&
            _mm_cmpestri(v, 16, epu8id, 16, FIRST_NON_ZERO) == 16 &&
            (diff == 16 || diff < k);
+#else
+#endif
 }
 
 } // namespace HPCombi
@@ -475,13 +485,13 @@ template <> struct not_equal_to<HPCombi::epu8> {
 
 template <> struct hash<HPCombi::epu8> {
     inline size_t operator()(HPCombi::epu8 a) const {
-        unsigned __int128 v0 = _mm_extract_epi64(a, 0);
-        unsigned __int128 v1 = _mm_extract_epi64(a, 1);
+        unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
+        unsigned __int128 v1 = simde_mm_extract_epi64(a, 1);
         return ((v1 * HPCombi::prime + v0) * HPCombi::prime) >> 64;
 
        /* The following is extremely slow on Renner benchmark
-          uint64_t v0 = _mm_extract_epi64(ar.v, 0);
-          uint64_t v1 = _mm_extract_epi64(ar.v, 1);
+          uint64_t v0 = simde_mm_extract_epi64(ar.v, 0);
+          uint64_t v1 = simde_mm_extract_epi64(ar.v, 1);
           size_t seed = v0 + 0x9e3779b9;
           seed ^= v1 + 0x9e3779b9 + (seed<<6) + (seed>>2);
           return seed;
diff --git a/include/perm16_impl.hpp b/include/perm16_impl.hpp
index 5ea3af2a..19c25563 100644
--- a/include/perm16_impl.hpp
+++ b/include/perm16_impl.hpp
@@ -50,18 +50,21 @@ inline epu8 PTransf16::domain_mask(bool complement) const {
     return complement ? v == Epu8(0xFF) : v != Epu8(0xFF);
 }
 inline uint32_t PTransf16::domain_bitset(bool complement) const {
-    return _mm_movemask_epi8(domain_mask(complement));
+    return simde_mm_movemask_epi8(domain_mask(complement));
 }
 inline PTransf16 PTransf16::right_one() const {
     return domain_mask(true) | epu8id;
 }
 
 inline epu8 PTransf16::image_mask(bool complement) const {
+#ifdef SIMDE_X86_SSE4_2_NATIVE
     return complement ? _mm_cmpestrm(v, 16, one().v, 16, FIND_IN_VECT)
                       : _mm_cmpestrm(v, 16, one().v, 16, FIND_IN_VECT_COMPL);
+#else
+#endif
 }
 inline uint32_t PTransf16::image_bitset(bool complement) const {
-    return _mm_movemask_epi8(image_mask(complement));
+    return simde_mm_movemask_epi8(image_mask(complement));
 }
 inline PTransf16 PTransf16::left_one() const {
     return image_mask(true) | epu8id;
@@ -83,7 +86,7 @@ inline epu8 PTransf16::fix_points_mask(bool complement) const {
     return complement ? v != one().v : v == one().v;
 }
 inline uint32_t PTransf16::fix_points_bitset(bool complement) const {
-    return _mm_movemask_epi8(fix_points_mask(complement));
+    return simde_mm_movemask_epi8(fix_points_mask(complement));
 }
 
 inline uint8_t PTransf16::smallest_fix_point() const {
@@ -120,14 +123,14 @@ inline static HPCOMBI_CONSTEXPR uint8_t hilo_mask_fun(uint8_t i) {
 static HPCOMBI_CONSTEXPR epu8 hilo_mask = Epu8(hilo_mask_fun);
 
 inline Transf16::Transf16(uint64_t compressed) {
-    epu8 res = _mm_set_epi64x(compressed, compressed);
-    v = _mm_blendv_epi8(res & Epu8(0x0F), res >> 4, hilo_mask);
+    epu8 res = simde_mm_set_epi64x(compressed, compressed);
+    v = simde_mm_blendv_epi8(res & Epu8(0x0F), res >> 4, hilo_mask);
 }
 
 inline Transf16::operator uint64_t() const {
-    epu8 res = static_cast<epu8>(_mm_slli_epi32(v, 4));
+    epu8 res = static_cast<epu8>(simde_mm_slli_epi32(v, 4));
     res = HPCombi::permuted(res, hilo_exchng) + v;
-    return _mm_extract_epi64(res, 0);
+    return simde_mm_extract_epi64(res, 0);
 }
 
 inline PPerm16 PPerm16::inverse_ref() const {
@@ -139,8 +142,11 @@ inline PPerm16 PPerm16::inverse_ref() const {
 }
 
 inline PPerm16 PPerm16::inverse_find() const {
+#ifdef SIMDE_X86_SSE4_2_NATIVE
     epu8 mask = _mm_cmpestrm(v, 16, one(), 16, FIND_IN_VECT);
     return permutation_of(v, one()) | mask;
+#else
+#endif
 }
 
 inline Perm16 Perm16::random(uint64_t n) {
@@ -207,7 +213,7 @@ inline Perm16 Perm16::inverse_sort() const {
     // G++-7 compile this shift by 3 additions.
     // epu8 res = (v << 4) + one().v;
     // I call directly the shift intrinsic
-    epu8 res = static_cast<epu8>(_mm_slli_epi32(v, 4)) + one().v;
+    epu8 res = static_cast<epu8>(simde_mm_slli_epi32(v, 4)) + one().v;
     res = sorted(res) & Epu8(0x0F);
     return res;
 }
@@ -230,7 +236,7 @@ inline Perm16 Perm16::inverse_cycl() const {
     for (int i = 9; i <= 16; i++) {
         Perm16 oldpow = newpow;
         newpow = oldpow * *this;
-        res.v = _mm_blendv_epi8(res, oldpow, newpow.v == one().v);
+        res.v = simde_mm_blendv_epi8(res, oldpow, newpow.v == one().v);
     }
     return res;
 }
@@ -307,7 +313,7 @@ inline uint8_t Perm16::nb_descents_ref() const {
     return res;
 }
 inline uint8_t Perm16::nb_descents() const {
-    return __builtin_popcountl(_mm_movemask_epi8(v < shifted_right(v)));
+    return __builtin_popcountl(simde_mm_movemask_epi8(v < shifted_right(v)));
 }
 
 inline uint8_t Perm16::nb_cycles_ref() const {
@@ -326,19 +332,19 @@ inline uint8_t Perm16::nb_cycles_ref() const {
 inline epu8 Perm16::cycles_partition() const {
     epu8 x0, x1 = one();
     Perm16 p = *this;
-    x0 = _mm_min_epi8(x1, HPCombi::permuted(x1, p));
+    x0 = simde_mm_min_epi8(x1, HPCombi::permuted(x1, p));
     p = p * p;
-    x1 = _mm_min_epi8(x0, HPCombi::permuted(x0, p));
+    x1 = simde_mm_min_epi8(x0, HPCombi::permuted(x0, p));
     p = p * p;
-    x0 = _mm_min_epi8(x1, HPCombi::permuted(x1, p));
+    x0 = simde_mm_min_epi8(x1, HPCombi::permuted(x1, p));
     p = p * p;
-    x1 = _mm_min_epi8(x0, HPCombi::permuted(x0, p));
+    x1 = simde_mm_min_epi8(x0, HPCombi::permuted(x0, p));
     return x1;
 }
 
 inline uint8_t Perm16::nb_cycles_unroll() const {
     epu8 res = (epu8id == cycles_partition());
-    return __builtin_popcountl(_mm_movemask_epi8(res));
+    return __builtin_popcountl(simde_mm_movemask_epi8(res));
 }
 
 inline bool Perm16::left_weak_leq_ref(Perm16 other) const {
@@ -356,8 +362,8 @@ inline bool Perm16::left_weak_leq(Perm16 other) const {
     for (size_t i = 0; i < 15; i++) {
         srot = shifted_right(srot);
         orot = shifted_right(orot);
-        uint64_t sinv = _mm_movemask_epi8(v < srot);
-        uint64_t oinv = _mm_movemask_epi8(other.v < orot);
+        uint64_t sinv = simde_mm_movemask_epi8(v < srot);
+        uint64_t oinv = simde_mm_movemask_epi8(other.v < orot);
         if ((sinv & oinv) != sinv)
            return false;
     }
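
Note (not part of the commit): when SIMDE_X86_SSE4_2_NATIVE is not defined, the
#else branch of permutation_of() above is left empty, so non-x86 builds still
have no implementation at this point. A possible scalar fallback is sketched
below; it assumes only the documented contract that
permuted(a, permutation_of(a, b)) == b for values of b occurring in a, and the
0xFF filler for values absent from a as well as the function name are
illustrative assumptions, not taken from the library.

    // Sketch of a portable fallback, to sit next to permutation_of() in the
    // HPCombi namespace.  Uses plain element access on epu8.
    inline epu8 permutation_of_portable(epu8 a, epu8 b) {
        epu8 res = Epu8(0xFF);           // assumed filler for values of b absent from a
        for (size_t i = 0; i < 16; i++) {
            for (size_t j = 0; j < 16; j++) {
                if (a[j] == b[i]) {      // b[i] is found at position j of a,
                    res[i] = j;          // so permuted(a, res)[i] == a[j] == b[i]
                    break;
                }
            }
        }
        return res;
    }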
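
Note (not part of the commit): is_permutation() is in the same state, with the
cmpestri-based test guarded and an empty #else branch. Since sorted(), equal(),
last_diff() and epu8id are already available in epu.hpp, one portable
formulation of the same test could look like the following sketch (the name is
illustrative, and its performance relative to the cmpestri version is untested):

    inline bool is_permutation_portable(epu8 v, const size_t k) {
        uint64_t diff = last_diff(v, epu8id, 16);
        // v is a permutation of {0,...,15} exactly when sorting it gives the identity
        return equal(sorted(v), epu8id) && (diff == 16 || diff < k);
    }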
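
Note (not part of the commit): first_mask() and last_mask() still call
_bit_scan_forward and _bit_scan_reverse, which come from the x86 intrinsic
headers rather than from simde. On GCC and clang a portable equivalent can be
built from the standard bit builtins; this sketch assumes res != 0, which both
call sites already guarantee by testing res == 0 first, and the *_portable
names are illustrative only:

    inline uint64_t bit_scan_forward_portable(uint32_t res) {
        return __builtin_ctz(res);       // index of the lowest set bit
    }
    inline uint64_t bit_scan_reverse_portable(uint32_t res) {
        return 31 - __builtin_clz(res);  // index of the highest set bit
    }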