From eb84cee8b886cf278e999973511f9382f408c0d3 Mon Sep 17 00:00:00 2001
From: Florent Hivert
Date: Mon, 23 Oct 2023 14:52:06 +0200
Subject: [PATCH] Switch to simde

---
 include/epu.hpp         | 26 ++++++++++------
 include/epu_impl.hpp    | 66 ++++++++++++++++++++++++-----------------
 include/perm16_impl.hpp | 40 ++++++++++++++-----------
 3 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/include/epu.hpp b/include/epu.hpp
index a3e13a88..87f807fe 100644
--- a/include/epu.hpp
+++ b/include/epu.hpp
@@ -22,7 +22,6 @@
 #include <functional>  // less<>, equal_to<>
 #include
 #include
-#include <x86intrin.h>
 
 #ifdef HPCOMBI_HAVE_CONFIG
 #include "HPCombi-config.h"
@@ -34,6 +33,11 @@
 
 #include "vect_generic.hpp"
 
+
+#include "simde/x86/sse4.1.h"
+#include "simde/x86/sse4.2.h"
+
+
 #ifdef HPCOMBI_CONSTEXPR_FUN_ARGS
 #define HPCOMBI_CONSTEXPR constexpr
 #define HPCOMBI_CONSTEXPR_CONSTRUCTOR constexpr
@@ -202,32 +206,32 @@ inline const VectGeneric<16> &as_VectGeneric(const epu8 &v) {
 }
 /** Test whether all the entries of a #HPCombi::epu8 are zero */
-inline bool is_all_zero(epu8 a) { return _mm_testz_si128(a, a); }
+inline bool is_all_zero(epu8 a) { return simde_mm_testz_si128(a, a); }
 /** Test whether all the entries of a #HPCombi::epu8 are one */
-inline bool is_all_one(epu8 a) { return _mm_testc_si128(a, Epu8(0xFF)); }
+inline bool is_all_one(epu8 a) { return simde_mm_testc_si128(a, Epu8(0xFF)); }
 /** Equality of #HPCombi::epu8 */
-inline bool equal(epu8 a, epu8 b) { return is_all_zero(_mm_xor_si128(a, b)); }
+inline bool equal(epu8 a, epu8 b) { return is_all_zero(simde_mm_xor_si128(a, b)); }
 /** Non equality of #HPCombi::epu8 */
 inline bool not_equal(epu8 a, epu8 b) { return not equal(a, b); }
 
 /** Permuting a #HPCombi::epu8 */
-inline epu8 permuted(epu8 a, epu8 b) { return _mm_shuffle_epi8(a, b); }
+inline epu8 permuted(epu8 a, epu8 b) { return simde_mm_shuffle_epi8(a, b); }
 
 /** Left shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
-inline epu8 shifted_right(epu8 a) { return _mm_bslli_si128(a, 1); }
+inline epu8 shifted_right(epu8 a) { return simde_mm_bslli_si128(a, 1); }
 
 /** Right shifted of a #HPCombi::epu8 inserting a 0
  * @warning we use the convention that the 0 entry is on the left !
  */
-inline epu8 shifted_left(epu8 a) { return _mm_bsrli_si128(a, 1); }
+inline epu8 shifted_left(epu8 a) { return simde_mm_bsrli_si128(a, 1); }
 
 /** Reverting a #HPCombi::epu8 */
 inline epu8 reverted(epu8 a) { return permuted(a, epu8rev); }
 
 /** Vector min between two #HPCombi::epu8 0 */
-inline epu8 min(epu8 a, epu8 b) { return _mm_min_epu8(a, b); }
+inline epu8 min(epu8 a, epu8 b) { return simde_mm_min_epu8(a, b); }
 /** Vector max between two #HPCombi::epu8 0 */
-inline epu8 max(epu8 a, epu8 b) { return _mm_max_epu8(a, b); }
+inline epu8 max(epu8 a, epu8 b) { return simde_mm_max_epu8(a, b); }
 /** Testing if a #HPCombi::epu8 is sorted */
 inline bool is_sorted(epu8 a);
@@ -546,11 +550,13 @@ inline epu8 eval16(epu8 v) { return eval16_cycle(v); };
  * Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16);
+#ifdef SIMDE_X86_SSE4_2_NATIVE
 /** @copydoc common_first_diff
  * @par Algorithm:
  * Using \c cmpestri instruction
  */
 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
+#endif
 /** @copydoc common_first_diff
  * @par Algorithm:
  * Using vector comparison and mask
@@ -584,11 +590,13 @@ inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) {
  * Reference @f$O(n)@f$ algorithm using loop and indexed access
  */
 inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16);
+#ifdef SIMDE_X86_SSE4_2_NATIVE
 /** @copydoc common_last_diff
  * @par Algorithm:
  * Using \c cmpestri instruction
 */
 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
+#endif
 /** @copydoc common_last_diff
  * @par Algorithm:
 * Using vector comparison and mask
diff --git a/include/epu_impl.hpp b/include/epu_impl.hpp
index c085abff..43784b3d 100644
--- a/include/epu_impl.hpp
+++ b/include/epu_impl.hpp
@@ -24,18 +24,18 @@
 
 // Comparison mode for _mm_cmpestri
 #define FIRST_DIFF \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | SIMDE_SIDD_NEGATIVE_POLARITY)
 #define LAST_DIFF \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY | \
-     _SIDD_MOST_SIGNIFICANT)
-#define FIRST_ZERO (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | SIMDE_SIDD_NEGATIVE_POLARITY | \
+     SIMDE_SIDD_MOST_SIGNIFICANT)
+#define FIRST_ZERO (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY)
 #define LAST_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MOST_SIGNIFICANT)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MOST_SIGNIFICANT)
 #define FIRST_NON_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MASKED_NEGATIVE_POLARITY)
 #define LAST_NON_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY | \
-     _SIDD_MOST_SIGNIFICANT)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MASKED_NEGATIVE_POLARITY | \
+     SIMDE_SIDD_MOST_SIGNIFICANT)
 
 namespace HPCombi {
@@ -45,11 +45,11 @@
 
 // Msk is supposed to be a boolean mask (i.e. each entry is either 0 or 255)
 inline uint64_t first_mask(epu8 msk, size_t bound) {
-    uint64_t res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
+    uint64_t res = simde_mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
     return res == 0 ? 16 : _bit_scan_forward(res);
 }
 inline uint64_t last_mask(epu8 msk, size_t bound) {
-    auto res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
+    auto res = simde_mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
     return res == 0 ? 16 : _bit_scan_reverse(res);
 }
 
@@ -59,9 +59,11 @@ inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound) {
             return i;
     return 16;
 }
+#ifdef SIMDE_X86_SSE4_2_NATIVE
 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
     return unsigned(_mm_cmpestri(a, bound, b, bound, FIRST_DIFF));
 }
+#endif
 inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound) {
     return first_mask(a != b, bound);
 }
@@ -74,9 +76,11 @@ inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound) {
     }
     return 16;
 }
+#ifdef SIMDE_X86_SSE4_2_NATIVE
 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
     return unsigned(_mm_cmpestri(a, bound, b, bound, LAST_DIFF));
 }
+#endif
 inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound) {
     return last_mask(a != b, bound);
 }
@@ -114,7 +118,7 @@ inline epu8 network_sort(epu8 res, std::array rounds) {
         epu8 mask = Increassing ? round < epu8id : epu8id < round;
         epu8 b = permuted(res, round);
         // res = mask ? min(res,b) : max(res,b); is not accepted by clang
-        res = _mm_blendv_epi8(min(res, b), max(res, b), mask);
+        res = simde_mm_blendv_epi8(min(res, b), max(res, b), mask);
     }
     return res;
 }
@@ -127,9 +131,9 @@ inline epu8 network_sort_perm(epu8 &v, std::array rounds) {
         // This conditional should be optimized out by the compiler
         epu8 mask = Increassing ? round < epu8id : epu8id < round;
         epu8 b = permuted(v, round);
-        epu8 cmp = _mm_blendv_epi8(b < v, v < b, mask);
-        v = _mm_blendv_epi8(v, b, cmp);
-        res = _mm_blendv_epi8(res, permuted(res, round), cmp);
+        epu8 cmp = simde_mm_blendv_epi8(b < v, v < b, mask);
+        v = simde_mm_blendv_epi8(v, b, cmp);
+        res = simde_mm_blendv_epi8(res, permuted(res, round), cmp);
     }
     return res;
 }
@@ -178,7 +182,7 @@ constexpr std::array sorting_rounds8
 // clang-format on
 
 inline bool is_sorted(epu8 a) {
-    return _mm_movemask_epi8(shifted_right(a) > a) == 0;
+    return simde_mm_movemask_epi8(shifted_right(a) > a) == 0;
 }
 inline epu8 sorted(epu8 a) {
     return network_sort(a, sorting_rounds);
@@ -215,7 +219,7 @@ inline epu8 random_epu8(uint16_t bnd) {
 inline epu8 remove_dups(epu8 v, uint8_t repl) {
     // Vector ternary operator is not supported by clang.
     // return (v != shifted_right(v) ? v : Epu8(repl);
-    return _mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
+    return simde_mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
 }
 
 // Gather at the front numbers with (3-i)-th bit not set.
@@ -229,12 +233,13 @@ constexpr std::array inverting_rounds {{
 }};
 
 #define FIND_IN_VECT \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK | \
-     _SIDD_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK | \
+     SIMDE_SIDD_NEGATIVE_POLARITY)
 #define FIND_IN_VECT_COMPL \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK)
 
 inline epu8 permutation_of(epu8 a, epu8 b) {
+#ifdef SIMDE_X86_SSE4_2_NATIVE
     epu8 res = -static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
     for (epu8 round : inverting_rounds) {
         a = permuted(a, round);
@@ -242,6 +247,8 @@ inline epu8 permutation_of(epu8 a, epu8 b) {
         res -= static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
     }
     return res;
+#else
+#endif
 }
 
@@ -404,7 +411,7 @@ inline epu8 eval16_cycle(epu8 v) {
 inline epu8 eval16_popcount(epu8 v) {
     epu8 res{};
     for (size_t i = 0; i < 16; i++) {
-        res[i] = __builtin_popcountl(_mm_movemask_epi8(v == Epu8(uint8_t(i))));
+        res[i] = __builtin_popcountl(simde_mm_movemask_epi8(v == Epu8(uint8_t(i))));
     }
     return res;
 }
@@ -419,13 +426,13 @@ inline bool is_partial_transformation(epu8 v, const size_t k) {
     uint64_t diff = last_diff(v, epu8id, 16);
     // (forall x in v, x + 1 <= 16) and
     // (v = Perm16::one() or last diff index < 16)
-    return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
+    return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
         && (diff == 16 || diff < k);
 }
 
 inline bool is_transformation(epu8 v, const size_t k) {
     uint64_t diff = last_diff(v, epu8id, 16);
-    return (_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff)
+    return (simde_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff)
         && (diff == 16 || diff < k);
 }
@@ -434,8 +441,8 @@ inline bool is_partial_permutation(epu8 v, const size_t k) {
     // (forall x in v, x <= 15) and
     // (forall x < 15, multiplicity x v <= 1
     // (v = Perm16::one() or last diff index < 16)
-    return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
-        && (_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff)
+    return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
+        && (simde_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff)
         && (diff == 16 || diff < k);
 }
 
@@ -444,9 +451,12 @@ inline bool is_permutation(epu8 v, const size_t k) {
     // (forall x in v, x in Perm16::one()) and
     // (forall x in Perm16::one(), x in v) and
     // (v = Perm16::one() or last diff index < 16)
+#ifdef SIMDE_X86_SSE4_2_NATIVE
     return _mm_cmpestri(epu8id, 16, v, 16, FIRST_NON_ZERO) == 16 &&
            _mm_cmpestri(v, 16, epu8id, 16, FIRST_NON_ZERO) == 16 &&
            (diff == 16 || diff < k);
+#else
+#endif
 }
 
 } // namespace HPCombi
@@ -475,13 +485,13 @@ template <> struct not_equal_to<HPCombi::epu8> {
 
 template <> struct hash<HPCombi::epu8> {
     inline size_t operator()(HPCombi::epu8 a) const {
-        unsigned __int128 v0 = _mm_extract_epi64(a, 0);
-        unsigned __int128 v1 = _mm_extract_epi64(a, 1);
+        unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
+        unsigned __int128 v1 = simde_mm_extract_epi64(a, 1);
         return ((v1 * HPCombi::prime + v0) * HPCombi::prime) >> 64;
 
        /* The following is extremely slow on Renner benchmark
-          uint64_t v0 = _mm_extract_epi64(ar.v, 0);
-          uint64_t v1 = _mm_extract_epi64(ar.v, 1);
+          uint64_t v0 = simde_mm_extract_epi64(ar.v, 0);
+          uint64_t v1 = simde_mm_extract_epi64(ar.v, 1);
           size_t seed = v0 + 0x9e3779b9;
           seed ^= v1 + 0x9e3779b9 + (seed<<6) + (seed>>2);
           return seed;
diff --git a/include/perm16_impl.hpp b/include/perm16_impl.hpp
index 5ea3af2a..19c25563 100644
--- a/include/perm16_impl.hpp
+++ b/include/perm16_impl.hpp
@@ -50,18 +50,21 @@ inline epu8 PTransf16::domain_mask(bool complement) const {
     return complement ? v == Epu8(0xFF) : v != Epu8(0xFF);
 }
 inline uint32_t PTransf16::domain_bitset(bool complement) const {
-    return _mm_movemask_epi8(domain_mask(complement));
+    return simde_mm_movemask_epi8(domain_mask(complement));
 }
 inline PTransf16 PTransf16::right_one() const {
     return domain_mask(true) | epu8id;
 }
 
 inline epu8 PTransf16::image_mask(bool complement) const {
+#ifdef SIMDE_X86_SSE4_2_NATIVE
     return complement ? _mm_cmpestrm(v, 16, one().v, 16, FIND_IN_VECT)
                       : _mm_cmpestrm(v, 16, one().v, 16, FIND_IN_VECT_COMPL);
+#else
+#endif
 }
 inline uint32_t PTransf16::image_bitset(bool complement) const {
-    return _mm_movemask_epi8(image_mask(complement));
+    return simde_mm_movemask_epi8(image_mask(complement));
 }
 inline PTransf16 PTransf16::left_one() const {
     return image_mask(true) | epu8id;
@@ -83,7 +86,7 @@ inline epu8 PTransf16::fix_points_mask(bool complement) const {
     return complement ? v != one().v : v == one().v;
 }
 inline uint32_t PTransf16::fix_points_bitset(bool complement) const {
-    return _mm_movemask_epi8(fix_points_mask(complement));
+    return simde_mm_movemask_epi8(fix_points_mask(complement));
 }
 
 inline uint8_t PTransf16::smallest_fix_point() const {
@@ -120,14 +123,14 @@ inline static HPCOMBI_CONSTEXPR uint8_t hilo_mask_fun(uint8_t i) {
 static HPCOMBI_CONSTEXPR epu8 hilo_mask = Epu8(hilo_mask_fun);
 
 inline Transf16::Transf16(uint64_t compressed) {
-    epu8 res = _mm_set_epi64x(compressed, compressed);
-    v = _mm_blendv_epi8(res & Epu8(0x0F), res >> 4, hilo_mask);
+    epu8 res = simde_mm_set_epi64x(compressed, compressed);
+    v = simde_mm_blendv_epi8(res & Epu8(0x0F), res >> 4, hilo_mask);
 }
 
 inline Transf16::operator uint64_t() const {
-    epu8 res = static_cast<epu8>(_mm_slli_epi32(v, 4));
+    epu8 res = static_cast<epu8>(simde_mm_slli_epi32(v, 4));
     res = HPCombi::permuted(res, hilo_exchng) + v;
-    return _mm_extract_epi64(res, 0);
+    return simde_mm_extract_epi64(res, 0);
 }
 
 inline PPerm16 PPerm16::inverse_ref() const {
@@ -139,8 +142,11 @@ inline PPerm16 PPerm16::inverse_ref() const {
 }
 
 inline PPerm16 PPerm16::inverse_find() const {
+#ifdef SIMDE_X86_SSE4_2_NATIVE
     epu8 mask = _mm_cmpestrm(v, 16, one(), 16, FIND_IN_VECT);
     return permutation_of(v, one()) | mask;
+#else
+#endif
 }
 
 inline Perm16 Perm16::random(uint64_t n) {
@@ -207,7 +213,7 @@ inline Perm16 Perm16::inverse_sort() const {
     // G++-7 compile this shift by 3 additions.
     // epu8 res = (v << 4) + one().v;
     // I call directly the shift intrinsic
-    epu8 res = static_cast<epu8>(_mm_slli_epi32(v, 4)) + one().v;
+    epu8 res = static_cast<epu8>(simde_mm_slli_epi32(v, 4)) + one().v;
     res = sorted(res) & Epu8(0x0F);
     return res;
 }
@@ -230,7 +236,7 @@ inline Perm16 Perm16::inverse_cycl() const {
     for (int i = 9; i <= 16; i++) {
         Perm16 oldpow = newpow;
         newpow = oldpow * *this;
-        res.v = _mm_blendv_epi8(res, oldpow, newpow.v == one().v);
+        res.v = simde_mm_blendv_epi8(res, oldpow, newpow.v == one().v);
     }
     return res;
 }
@@ -307,7 +313,7 @@ inline uint8_t Perm16::nb_descents_ref() const {
     return res;
 }
 inline uint8_t Perm16::nb_descents() const {
-    return __builtin_popcountl(_mm_movemask_epi8(v < shifted_right(v)));
+    return __builtin_popcountl(simde_mm_movemask_epi8(v < shifted_right(v)));
 }
 
 inline uint8_t Perm16::nb_cycles_ref() const {
@@ -326,19 +332,19 @@ inline uint8_t Perm16::nb_cycles_ref() const {
 inline epu8 Perm16::cycles_partition() const {
     epu8 x0, x1 = one();
     Perm16 p = *this;
-    x0 = _mm_min_epi8(x1, HPCombi::permuted(x1, p));
+    x0 = simde_mm_min_epi8(x1, HPCombi::permuted(x1, p));
     p = p * p;
-    x1 = _mm_min_epi8(x0, HPCombi::permuted(x0, p));
+    x1 = simde_mm_min_epi8(x0, HPCombi::permuted(x0, p));
     p = p * p;
-    x0 = _mm_min_epi8(x1, HPCombi::permuted(x1, p));
+    x0 = simde_mm_min_epi8(x1, HPCombi::permuted(x1, p));
     p = p * p;
-    x1 = _mm_min_epi8(x0, HPCombi::permuted(x0, p));
+    x1 = simde_mm_min_epi8(x0, HPCombi::permuted(x0, p));
     return x1;
 }
 
 inline uint8_t Perm16::nb_cycles_unroll() const {
     epu8 res = (epu8id == cycles_partition());
-    return __builtin_popcountl(_mm_movemask_epi8(res));
+    return __builtin_popcountl(simde_mm_movemask_epi8(res));
 }
 
 inline bool Perm16::left_weak_leq_ref(Perm16 other) const {
@@ -356,8 +362,8 @@ inline bool Perm16::left_weak_leq(Perm16 other) const {
     for (size_t i = 0; i < 15; i++) {
         srot = shifted_right(srot);
         orot = shifted_right(orot);
-        uint64_t sinv = _mm_movemask_epi8(v < srot);
-        uint64_t oinv = _mm_movemask_epi8(other.v < orot);
+        uint64_t sinv = simde_mm_movemask_epi8(v < srot);
+        uint64_t oinv = simde_mm_movemask_epi8(other.v < orot);
         if ((sinv & oinv) != sinv)
            return false;
     }
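
Note (not part of the commit): when SIMDE_X86_SSE4_2_NATIVE is not defined, the
#else branch of permutation_of() above is left empty, so non-x86 builds still
have no implementation at this point. A possible scalar fallback is sketched
below; it assumes only the documented contract that
permuted(a, permutation_of(a, b)) == b for values of b occurring in a, and the
0xFF filler for values absent from a as well as the function name are
illustrative assumptions, not taken from the library.

    // Sketch of a portable fallback, to sit next to permutation_of() in the
    // HPCombi namespace.  Uses plain element access on epu8.
    inline epu8 permutation_of_portable(epu8 a, epu8 b) {
        epu8 res = Epu8(0xFF);           // assumed filler for values of b absent from a
        for (size_t i = 0; i < 16; i++) {
            for (size_t j = 0; j < 16; j++) {
                if (a[j] == b[i]) {      // b[i] is found at position j of a,
                    res[i] = j;          // so permuted(a, res)[i] == a[j] == b[i]
                    break;
                }
            }
        }
        return res;
    }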
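
Note (not part of the commit): is_permutation() is in the same state, with the
cmpestri-based test guarded and an empty #else branch. Since sorted(), equal(),
last_diff() and epu8id are already available in epu.hpp, one portable
formulation of the same test could look like the following sketch (the name is
illustrative, and its performance relative to the cmpestri version is untested):

    inline bool is_permutation_portable(epu8 v, const size_t k) {
        uint64_t diff = last_diff(v, epu8id, 16);
        // v is a permutation of {0,...,15} exactly when sorting it gives the identity
        return equal(sorted(v), epu8id) && (diff == 16 || diff < k);
    }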
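
Note (not part of the commit): first_mask() and last_mask() still call
_bit_scan_forward and _bit_scan_reverse, which come from the x86 intrinsic
headers rather than from simde. On GCC and clang a portable equivalent can be
built from the standard bit builtins; this sketch assumes res != 0, which both
call sites already guarantee by testing res == 0 first, and the *_portable
names are illustrative only:

    inline uint64_t bit_scan_forward_portable(uint32_t res) {
        return __builtin_ctz(res);       // index of the lowest set bit
    }
    inline uint64_t bit_scan_reverse_portable(uint32_t res) {
        return 31 - __builtin_clz(res);  // index of the highest set bit
    }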