Switch to simde

hivert committed Oct 23, 2023
1 parent 282ec03 commit eb84cee
Showing 3 changed files with 78 additions and 54 deletions.
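The commit replaces raw x86 intrinsics (the _mm_* functions from <x86intrin.h>) with their SIMDe equivalents (simde_mm_*), which compile down to the native SSE instructions on x86 and to portable fallbacks elsewhere. A minimal sketch of the pattern, not taken from the commit (simde_mm_set1_epi8 and simde_mm_extract_epi8 are the standard SIMDe spellings of the corresponding SSE intrinsics):

    // Builds wherever SIMDe is supported, not just on x86.
    #include <cstdio>
    #include "simde/x86/sse4.1.h"

    int main() {
        simde__m128i a = simde_mm_set1_epi8(3);            // 16 bytes, all 3
        simde__m128i b = simde_mm_set1_epi8(7);            // 16 bytes, all 7
        simde__m128i m = simde_mm_max_epu8(a, b);          // element-wise unsigned max
        std::printf("%d\n", simde_mm_extract_epi8(m, 0));  // prints 7, even on ARM
    }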
26 changes: 17 additions & 9 deletions include/epu.hpp
@@ -22,7 +22,6 @@
#include <functional> // less<>, equal_to<>
#include <iomanip>
#include <ostream>
-#include <x86intrin.h>

#ifdef HPCOMBI_HAVE_CONFIG
#include "HPCombi-config.h"
@@ -34,6 +33,11 @@

#include "vect_generic.hpp"


#include "simde/x86/sse4.1.h"
#include "simde/x86/sse4.2.h"


#ifdef HPCOMBI_CONSTEXPR_FUN_ARGS
#define HPCOMBI_CONSTEXPR constexpr
#define HPCOMBI_CONSTEXPR_CONSTRUCTOR constexpr
@@ -202,32 +206,32 @@ inline const VectGeneric<16> &as_VectGeneric(const epu8 &v) {
}

/** Test whether all the entries of a #HPCombi::epu8 are zero */
-inline bool is_all_zero(epu8 a) { return _mm_testz_si128(a, a); }
+inline bool is_all_zero(epu8 a) { return simde_mm_testz_si128(a, a); }
/** Test whether all the entries of a #HPCombi::epu8 are one */
-inline bool is_all_one(epu8 a) { return _mm_testc_si128(a, Epu8(0xFF)); }
+inline bool is_all_one(epu8 a) { return simde_mm_testc_si128(a, Epu8(0xFF)); }
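For reference, the two bit-test intrinsics above work as follows: testz(a, b) is nonzero iff (a & b) == 0, so testz(a, a) checks that a is identically zero, while testc(a, b) is nonzero iff (~a & b) == 0, so testc(a, Epu8(0xFF)) checks that every bit of a is set. A standalone sketch of both facts, not part of the commit:

    #include <cassert>
    #include "simde/x86/sse4.1.h"

    int main() {
        simde__m128i zero = simde_mm_setzero_si128();
        simde__m128i ones = simde_mm_set1_epi8(-1);  // all 128 bits set
        assert(simde_mm_testz_si128(zero, zero));    // zero & zero == 0
        assert(simde_mm_testc_si128(ones, ones));    // ~ones & ones == 0
    }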

/** Equality of #HPCombi::epu8 */
-inline bool equal(epu8 a, epu8 b) { return is_all_zero(_mm_xor_si128(a, b)); }
+inline bool equal(epu8 a, epu8 b) { return is_all_zero(simde_mm_xor_si128(a, b)); }
/** Non equality of #HPCombi::epu8 */
inline bool not_equal(epu8 a, epu8 b) { return not equal(a, b); }

/** Permuting a #HPCombi::epu8 */
-inline epu8 permuted(epu8 a, epu8 b) { return _mm_shuffle_epi8(a, b); }
+inline epu8 permuted(epu8 a, epu8 b) { return simde_mm_shuffle_epi8(a, b); }
/** Left shifted of a #HPCombi::epu8 inserting a 0
* @warning we use the convention that the 0 entry is on the left !
*/
-inline epu8 shifted_right(epu8 a) { return _mm_bslli_si128(a, 1); }
+inline epu8 shifted_right(epu8 a) { return simde_mm_bslli_si128(a, 1); }
/** Right shifted of a #HPCombi::epu8 inserting a 0
* @warning we use the convention that the 0 entry is on the left !
*/
-inline epu8 shifted_left(epu8 a) { return _mm_bsrli_si128(a, 1); }
+inline epu8 shifted_left(epu8 a) { return simde_mm_bsrli_si128(a, 1); }
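The crossed names are deliberate: the byte-shift intrinsics count in register order, while HPCombi prints entry 0 leftmost, so bslli (shift left in register order) moves every entry to a higher index, which reads as a shift to the right in display order. A quick check of that convention, not from the commit:

    #include <cassert>
    #include "simde/x86/sse4.1.h"

    int main() {
        simde__m128i a = simde_mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
                                            9, 10, 11, 12, 13, 14, 15, 16);
        simde__m128i r = simde_mm_bslli_si128(a, 1);  // bytes move to higher indices
        assert(simde_mm_extract_epi8(r, 0) == 0);     // a 0 enters at index 0
        assert(simde_mm_extract_epi8(r, 1) == 1);     // old entry 0 is now entry 1
    }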
/** Reverting a #HPCombi::epu8 */
inline epu8 reverted(epu8 a) { return permuted(a, epu8rev); }

/** Vector min between two #HPCombi::epu8 0 */
-inline epu8 min(epu8 a, epu8 b) { return _mm_min_epu8(a, b); }
+inline epu8 min(epu8 a, epu8 b) { return simde_mm_min_epu8(a, b); }
/** Vector max between two #HPCombi::epu8 0 */
-inline epu8 max(epu8 a, epu8 b) { return _mm_max_epu8(a, b); }
+inline epu8 max(epu8 a, epu8 b) { return simde_mm_max_epu8(a, b); }

/** Testing if a #HPCombi::epu8 is sorted */
inline bool is_sorted(epu8 a);
@@ -546,11 +550,13 @@ inline epu8 eval16(epu8 v) { return eval16_cycle(v); };
* Reference @f$O(n)@f$ algorithm using loop and indexed access
*/
inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16);
+#ifdef SIMDE_X86_SSE4_2_NATIVE
/** @copydoc common_first_diff
* @par Algorithm:
* Using \c cmpestri instruction
*/
inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
+#endif
/** @copydoc common_first_diff
* @par Algorithm:
* Using vector comparison and mask
@@ -584,11 +590,13 @@ inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) {
* Reference @f$O(n)@f$ algorithm using loop and indexed access
*/
inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16);
+#ifdef SIMDE_X86_SSE4_2_NATIVE
/** @copydoc common_last_diff
* @par Algorithm:
* Using \c cmpestri instruction
*/
inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
+#endif
/** @copydoc common_last_diff
* @par Algorithm:
* Using vector comparison and mask
66 changes: 38 additions & 28 deletions include/epu_impl.hpp
@@ -24,18 +24,18 @@

// Comparison mode for _mm_cmpestri
#define FIRST_DIFF \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | SIMDE_SIDD_NEGATIVE_POLARITY)
#define LAST_DIFF \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY | \
-     _SIDD_MOST_SIGNIFICANT)
-#define FIRST_ZERO (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_EACH | SIMDE_SIDD_NEGATIVE_POLARITY | \
+     SIMDE_SIDD_MOST_SIGNIFICANT)
+#define FIRST_ZERO (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY)
#define LAST_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MOST_SIGNIFICANT)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MOST_SIGNIFICANT)
#define FIRST_NON_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MASKED_NEGATIVE_POLARITY)
#define LAST_NON_ZERO \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY | \
-     _SIDD_MOST_SIGNIFICANT)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_MASKED_NEGATIVE_POLARITY | \
+     SIMDE_SIDD_MOST_SIGNIFICANT)
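The SIMDE_SIDD_* constants are SIMDe's spellings of the _SIDD_* mode flags of the SSE4.2 string-comparison instructions: each macro combines an element width (UBYTE_OPS), a comparison kind (CMP_EQUAL_EACH for element-wise equality, CMP_EQUAL_ANY for set membership), and polarity/direction modifiers (NEGATIVE_POLARITY inverts the match, MOST_SIGNIFICANT scans from the high end). As the #ifdef SIMDE_X86_SSE4_2_NATIVE guards added below show, the cmpestri/cmpestrm code paths still require native SSE4.2. A sketch of the set-membership mode, assuming such a native build (not part of the commit):

    #ifdef SIMDE_X86_SSE4_2_NATIVE
    // FIRST_ZERO performs a membership test: index of the first byte of
    // haystack that occurs among the first n bytes of needles (16 if none).
    inline int find_any_demo(__m128i needles, int n, __m128i haystack) {
        return _mm_cmpestri(needles, n, haystack, 16, FIRST_ZERO);
    }
    #endif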

namespace HPCombi {

@@ -45,11 +45,11 @@ namespace HPCombi {

// Msk is supposed to be a boolean mask (i.e. each entry is either 0 or 255)
inline uint64_t first_mask(epu8 msk, size_t bound) {
-    uint64_t res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
+    uint64_t res = simde_mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
return res == 0 ? 16 : _bit_scan_forward(res);
}
inline uint64_t last_mask(epu8 msk, size_t bound) {
-    auto res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
+    auto res = simde_mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
return res == 0 ? 16 : _bit_scan_reverse(res);
}
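movemask_epi8 packs the most significant bit of each of the 16 bytes into the low bits of an int, so a 0/255 byte mask becomes a 16-bit integer whose bit i is set exactly when byte i is 255; _bit_scan_forward and _bit_scan_reverse then pick out the lowest and highest set bit. Note that those two come from <x86intrin.h>, which this commit stops including, so a fully portable build would presumably need the compiler builtins instead — a sketch of that assumption, not part of the commit:

    // first_mask rewritten with GCC/clang builtins instead of x86 bit-scan.
    inline uint64_t first_mask_portable(epu8 msk, size_t bound) {
        uint32_t res = simde_mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
        return res == 0 ? 16 : __builtin_ctz(res);  // index of lowest set bit
    }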

@@ -59,9 +59,11 @@ inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound) {
return i;
return 16;
}
+#ifdef SIMDE_X86_SSE4_2_NATIVE
inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
return unsigned(_mm_cmpestri(a, bound, b, bound, FIRST_DIFF));
}
+#endif
inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound) {
return first_mask(a != b, bound);
}
@@ -74,9 +76,11 @@ inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound) {
}
return 16;
}
+#ifdef SIMDE_X86_SSE4_2_NATIVE
inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
return unsigned(_mm_cmpestri(a, bound, b, bound, LAST_DIFF));
}
+#endif
inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound) {
return last_mask(a != b, bound);
}
@@ -114,7 +118,7 @@ inline epu8 network_sort(epu8 res, std::array<epu8, sz> rounds) {
epu8 mask = Increassing ? round < epu8id : epu8id < round;
epu8 b = permuted(res, round);
// res = mask ? min(res,b) : max(res,b); is not accepted by clang
-        res = _mm_blendv_epi8(min(res, b), max(res, b), mask);
+        res = simde_mm_blendv_epi8(min(res, b), max(res, b), mask);
}
return res;
}
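Each round of the network is a vectorized compare-exchange: blendv_epi8 picks, byte by byte, from its second operand where the corresponding mask byte has its high bit set and from the first operand otherwise, so the lanes designated as the max side of each comparator keep the larger value. One comparator stage in isolation, as a sketch (not from the commit):

    // Lanes flagged in take_max_mask receive max(v, w), the others min(v, w).
    inline simde__m128i compare_exchange(simde__m128i v, simde__m128i w,
                                         simde__m128i take_max_mask) {
        return simde_mm_blendv_epi8(simde_mm_min_epu8(v, w),
                                    simde_mm_max_epu8(v, w), take_max_mask);
    }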
@@ -127,9 +131,9 @@ inline epu8 network_sort_perm(epu8 &v, std::array<epu8, sz> rounds) {
// This conditional should be optimized out by the compiler
epu8 mask = Increassing ? round < epu8id : epu8id < round;
epu8 b = permuted(v, round);
-        epu8 cmp = _mm_blendv_epi8(b < v, v < b, mask);
-        v = _mm_blendv_epi8(v, b, cmp);
-        res = _mm_blendv_epi8(res, permuted(res, round), cmp);
+        epu8 cmp = simde_mm_blendv_epi8(b < v, v < b, mask);
+        v = simde_mm_blendv_epi8(v, b, cmp);
+        res = simde_mm_blendv_epi8(res, permuted(res, round), cmp);
}
return res;
}
@@ -178,7 +182,7 @@ constexpr std::array<epu8, 6> sorting_rounds8
// clang-format on

inline bool is_sorted(epu8 a) {
-    return _mm_movemask_epi8(shifted_right(a) > a) == 0;
+    return simde_mm_movemask_epi8(shifted_right(a) > a) == 0;
}
inline epu8 sorted(epu8 a) {
return network_sort<true>(a, sorting_rounds);
@@ -215,7 +219,7 @@ inline epu8 random_epu8(uint16_t bnd) {
inline epu8 remove_dups(epu8 v, uint8_t repl) {
// Vector ternary operator is not supported by clang.
// return (v != shifted_right(v) ? v : Epu8(repl);
-    return _mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
+    return simde_mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
}

// Gather at the front numbers with (3-i)-th bit not set.
@@ -229,19 +233,22 @@ constexpr std::array<epu8, 3> inverting_rounds {{
}};

#define FIND_IN_VECT \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK | \
-     _SIDD_NEGATIVE_POLARITY)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK | \
+     SIMDE_SIDD_NEGATIVE_POLARITY)
#define FIND_IN_VECT_COMPL \
-    (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK)
+    (SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY | SIMDE_SIDD_UNIT_MASK)

inline epu8 permutation_of(epu8 a, epu8 b) {
+#ifdef SIMDE_X86_SSE4_2_NATIVE
epu8 res = -static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
for (epu8 round : inverting_rounds) {
a = permuted(a, round);
res <<= 1;
res -= static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
}
return res;
+#else
+#endif
}
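Note that the #else branch added here is left empty, so a build without native SSE4.2 would fall off the end of a non-void function. A portable reference fallback, hypothetical and not part of this commit (permutation_of returns p such that a[p[i]] == b[i]):

    inline epu8 permutation_of_ref(epu8 a, epu8 b) {
        epu8 res{};
        for (size_t i = 0; i < 16; i++) {
            size_t j = 0;
            while (j < 16 && a[j] != b[i])  // locate b[i] inside a
                j++;
            res[i] = static_cast<uint8_t>(j);  // 16 means "not found"
        }
        return res;
    }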


@@ -404,7 +411,7 @@ inline epu8 eval16_cycle(epu8 v) {
inline epu8 eval16_popcount(epu8 v) {
epu8 res{};
for (size_t i = 0; i < 16; i++) {
-        res[i] = __builtin_popcountl(_mm_movemask_epi8(v == Epu8(uint8_t(i))));
+        res[i] = __builtin_popcountl(simde_mm_movemask_epi8(v == Epu8(uint8_t(i))));
}
return res;
}
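eval16 computes the multiplicity of each value 0..15: comparing v with the broadcast constant Epu8(i) yields 0xFF exactly in the bytes equal to i, movemask packs those bytes into an int, and popcount counts them. The same trick in isolation, as a sketch (not from the commit):

    // How many of the 16 bytes of v equal x?
    inline int count_value(simde__m128i v, uint8_t x) {
        simde__m128i eq = simde_mm_cmpeq_epi8(v, simde_mm_set1_epi8((int8_t)x));
        return __builtin_popcount((unsigned)simde_mm_movemask_epi8(eq));
    }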
@@ -419,13 +426,13 @@ inline bool is_partial_transformation(epu8 v, const size_t k) {
uint64_t diff = last_diff(v, epu8id, 16);
// (forall x in v, x + 1 <= 16) and
// (v = Perm16::one() or last diff index < 16)
-    return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
+    return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
&& (diff == 16 || diff < k);
}

inline bool is_transformation(epu8 v, const size_t k) {
uint64_t diff = last_diff(v, epu8id, 16);
-    return (_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff)
+    return (simde_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff)
&& (diff == 16 || diff < k);
}

@@ -434,8 +441,8 @@ inline bool is_partial_permutation(epu8 v, const size_t k) {
// (forall x in v, x <= 15) and
// (forall x < 15, multiplicity x v <= 1
// (v = Perm16::one() or last diff index < 16)
-    return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
-        && (_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff)
+    return (simde_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
+        && (simde_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff)
&& (diff == 16 || diff < k);
}

@@ -444,9 +451,12 @@ inline bool is_permutation(epu8 v, const size_t k) {
// (forall x in v, x in Perm16::one()) and
// (forall x in Perm16::one(), x in v) and
// (v = Perm16::one() or last diff index < 16)
+#ifdef SIMDE_X86_SSE4_2_NATIVE
return _mm_cmpestri(epu8id, 16, v, 16, FIRST_NON_ZERO) == 16
&& _mm_cmpestri(v, 16, epu8id, 16, FIRST_NON_ZERO) == 16
&& (diff == 16 || diff < k);
+#else
+#endif
}
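As with permutation_of above, the #else branch is empty, leaving non-SSE4.2 builds without a return value. A portable check, hypothetical and not in this commit, can reuse the sorting network defined earlier: v is a permutation of 0..15 exactly when sorted(v) is the identity.

    inline bool is_permutation_ref(epu8 v, const size_t k) {
        uint64_t diff = last_diff(v, epu8id, 16);
        return equal(sorted(v), epu8id) && (diff == 16 || diff < k);
    }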

} // namespace HPCombi
@@ -475,13 +485,13 @@ template <> struct not_equal_to<HPCombi::epu8> {

template <> struct hash<HPCombi::epu8> {
inline size_t operator()(HPCombi::epu8 a) const {
-        unsigned __int128 v0 = _mm_extract_epi64(a, 0);
-        unsigned __int128 v1 = _mm_extract_epi64(a, 1);
+        unsigned __int128 v0 = simde_mm_extract_epi64(a, 0);
+        unsigned __int128 v1 = simde_mm_extract_epi64(a, 1);
return ((v1 * HPCombi::prime + v0) * HPCombi::prime) >> 64;

/* The following is extremely slow on Renner benchmark
-        uint64_t v0 = _mm_extract_epi64(ar.v, 0);
-        uint64_t v1 = _mm_extract_epi64(ar.v, 1);
+        uint64_t v0 = simde_mm_extract_epi64(ar.v, 0);
+        uint64_t v1 = simde_mm_extract_epi64(ar.v, 1);
size_t seed = v0 + 0x9e3779b9;
seed ^= v1 + 0x9e3779b9 + (seed<<6) + (seed>>2);
return seed;