Use global Macros for GCC specific keywords #68

Merged (3 commits) on Sep 5, 2023
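
Summary of the change: the PR replaces the raw `#pragma GCC unroll N` directives scattered through the sorting kernels with a project-wide X86_SIMD_SORT_UNROLL_LOOP macro, and moves the LIKELY/UNLIKELY branch hints into the compiler-specific blocks, so GCC-only keywords no longer appear in the shared code paths. As a rough sketch of how such an unroll wrapper is commonly written (illustrative only; the stringizing helper name below is hypothetical and not taken from this PR):

#include <cstdint>

/* Sketch only: wrap the GCC-specific unroll pragma behind a portable macro.
 * X86_SIMD_SORT_STRINGIFY is a hypothetical helper, not from this PR. */
#if defined(__GNUC__) && __GNUC__ >= 8
#define X86_SIMD_SORT_STRINGIFY(x) #x
/* _Pragma needs a string literal, so stringize "GCC unroll <num>" first. */
#define X86_SIMD_SORT_UNROLL_LOOP(num) \
    _Pragma(X86_SIMD_SORT_STRINGIFY(GCC unroll num))
#else
#define X86_SIMD_SORT_UNROLL_LOOP(num) /* no-op on other compilers */
#endif

int64_t sum8(const int64_t *arr)
{
    int64_t total = 0;
    /* Expands to #pragma GCC unroll 8 under GCC >= 8, to nothing elsewhere. */
    X86_SIMD_SORT_UNROLL_LOOP(8)
    for (int ii = 0; ii < 8; ++ii) {
        total += arr[ii];
    }
    return total;
}
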
src/avx512-64bit-argsort.hpp: 13 additions & 13 deletions

@@ -108,7 +108,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
zmm_t arrzmm[4];
argzmm_t argzmm[4];

#pragma GCC unroll 2
#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
for (int ii = 0; ii < 2; ++ii) {
argzmm[ii] = argtype::loadu(arg + 8 * ii);
arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -117,7 +117,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)

uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull;
opmask_t load_mask[2] = {0xFF, 0xFF};
#pragma GCC unroll 2
#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
for (int ii = 0; ii < 2; ++ii) {
load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii);
@@ -151,7 +151,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
zmm_t arrzmm[8];
argzmm_t argzmm[8];

#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argzmm[ii] = argtype::loadu(arg + 8 * ii);
arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -160,7 +160,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)

opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF};
uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii);
@@ -170,7 +170,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
argzmm[ii + 4]);
}

#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 8; ii = ii + 2) {
bitonic_merge_two_zmm_64bit<vtype, argtype>(
arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
@@ -179,11 +179,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
bitonic_merge_four_zmm_64bit<vtype, argtype>(arrzmm + 4, argzmm + 4);
bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm, argzmm);

#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argtype::storeu(arg + 8 * ii, argzmm[ii]);
}
#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]);
}
@@ -203,7 +203,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// zmm_t arrzmm[16];
// argzmm_t argzmm[16];
//
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argzmm[ii] = argtype::loadu(arg + 8*ii);
// arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -213,19 +213,19 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
// if (N != 128) {
// uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF;
// }
// }
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argzmm[ii+8] = argtype::maskz_loadu(load_mask[ii], arg + 64 + 8*ii);
// arrzmm[ii+8] = vtype::template mask_i64gather<sizeof(type_t)>(vtype::zmm_max(), load_mask[ii], argzmm[ii+8], arr);
// arrzmm[ii+8] = sort_zmm_64bit<vtype, argtype>(arrzmm[ii+8], argzmm[ii+8]);
// }
//
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 16; ii = ii + 2) {
// bitonic_merge_two_zmm_64bit<vtype, argtype>(arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
// }
@@ -237,11 +237,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm+8, argzmm+8);
// bitonic_merge_sixteen_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
//
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argtype::storeu(arg + 8*ii, argzmm[ii]);
// }
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argtype::mask_storeu(arg + 64 + 8*ii, load_mask[ii], argzmm[ii + 8]);
// }
src/avx512-common-argsort.h: 6 additions & 6 deletions

@@ -198,7 +198,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
// first and last vtype::numlanes values are partitioned at the end
zmm_t vec_left[num_unroll], vec_right[num_unroll];
argzmm_t argvec_left[num_unroll], argvec_right[num_unroll];
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii);
vec_left[ii] = vtype::template i64gather<sizeof(type_t)>(
@@ -224,7 +224,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
*/
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= num_unroll * vtype::numlanes;
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arg_vec[ii]
= argtype::loadu(arg + right + ii * vtype::numlanes);
@@ -233,7 +233,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
}
}
else {
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes);
curr_vec[ii] = vtype::template i64gather<sizeof(type_t)>(
@@ -242,7 +242,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
left += num_unroll * vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
@@ -259,7 +259,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
}

/* partition and save vec_left and vec_right */
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
@@ -273,7 +273,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
l_store += (vtype::numlanes - amount_gt_pivot);
r_store -= amount_gt_pivot;
}
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
src/avx512-common-qsort.h: 19 additions & 8 deletions

@@ -67,9 +67,12 @@
#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d

/* Compiler specific macros */
#ifdef _MSC_VER
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __forceinline
#define LIKELY(x)
#define UNLIKELY(x)
#elif defined(__CYGWIN__)
/*
* Force inline in cygwin to work around a compiler bug. See
@@ -80,13 +80,21 @@
#elif defined(__GNUC__)
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define X86_SIMD_SORT_INLINE static
#define X86_SIMD_SORT_FINLINE static
#define LIKELY(x)
#define UNLIKELY(x)
#endif

#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#if __GNUC__ >= 8
#define X86_SIMD_SORT_UNROLL_LOOP(num)\
GCC unroll num
#else
#define X86_SIMD_SORT_UNROLL_LOOP(num)
#endif

template <typename type>
struct zmm_vector;
@@ -382,7 +393,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
// We will now have atleast 16 registers worth of data to process:
// left and right vtype::numlanes values are partitioned at the end
zmm_t vec_left[num_unroll], vec_right[num_unroll];
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
vec_right[ii] = vtype::loadu(
@@ -403,20 +414,20 @@
*/
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= num_unroll * vtype::numlanes;
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
}
}
else {
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
}
left += num_unroll * vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
@@ -432,7 +443,7 @@
}

/* partition and save vec_left[8] and vec_right[8] */
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
@@ -445,7 +456,7 @@
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
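
For reference, a minimal portable sketch of the LIKELY/UNLIKELY branch hints that this PR moves into the compiler-specific blocks (assumed form, not copied from the diff; note the fallback expands to the plain expression so call sites stay valid on every compiler):

#include <cstddef>

/* Sketch only: branch-prediction hints built on __builtin_expect where
 * available, degrading to a no-op wrapper elsewhere. */
#if defined(__GNUC__)
#define LIKELY(x) __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif

int clamp_index(std::ptrdiff_t i, std::ptrdiff_t n)
{
    if (UNLIKELY(i < 0)) return 0;                 /* rare path */
    if (UNLIKELY(i >= n)) return (int)(n - 1);     /* rare path */
    return (int)i;                                 /* hot path */
}
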