diff --git a/Makefile b/Makefile
index 4474f462..dbda049d 100644
--- a/Makefile
+++ b/Makefile
@@ -46,7 +46,7 @@ BENCHOBJS := $(patsubst %.cpp, %.o, $(filter-out $(addprefix $(BENCHDIR)/, $(BEN
 TESTOBJS := $(patsubst %.cpp, %.o, $(filter-out $(addprefix $(TESTDIR)/, $(TESTS_SKIP)), $(TESTS)))
 UTILOBJS := $(UTILS:.cpp=.o)
 
-# Stops make from wondering if it needs to generate the .hpp files (.cpp and .h have equivalent rules by default)
+# Stops make from wondering if it needs to generate the .hpp files (.cpp and .h have equivalent rules by default)
 %.hpp:
 
 .PHONY: all
@@ -75,7 +75,7 @@ benchexe: $(BENCHOBJS) $(UTILOBJS)
 
 .PHONY: meson
 meson:
-	meson setup --warnlevel 0 --buildtype plain builddir
+	meson setup --warnlevel 2 --buildtype plain builddir
 	cd builddir && ninja
 
 .PHONY: clean
diff --git a/src/avx512-64bit-argsort.hpp b/src/avx512-64bit-argsort.hpp
index 000f822c..79001d33 100644
--- a/src/avx512-64bit-argsort.hpp
+++ b/src/avx512-64bit-argsort.hpp
@@ -108,7 +108,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
     zmm_t arrzmm[4];
     argzmm_t argzmm[4];
 
-#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
+X86_SIMD_SORT_UNROLL_LOOP(2)
     for (int ii = 0; ii < 2; ++ii) {
         argzmm[ii] = argtype::loadu(arg + 8 * ii);
         arrzmm[ii] = vtype::template i64gather(argzmm[ii], arr);
@@ -117,7 +117,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
     uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull;
     opmask_t load_mask[2] = {0xFF, 0xFF};
 
-#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
+X86_SIMD_SORT_UNROLL_LOOP(2)
     for (int ii = 0; ii < 2; ++ii) {
         load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
         argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii);
@@ -151,7 +151,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
     zmm_t arrzmm[8];
     argzmm_t argzmm[8];
 
-#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
+X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 4; ++ii) {
         argzmm[ii] = argtype::loadu(arg + 8 * ii);
         arrzmm[ii] = vtype::template i64gather(argzmm[ii], arr);
@@ -160,7 +160,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
     opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF};
     uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
 
-#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
+X86_SIMD_SORT_UNROLL_LOOP(4)
    for (int ii = 0; ii < 4; ++ii) {
         load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
         argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii);
@@ -170,7 +170,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
                 argzmm[ii + 4]);
     }
 
-#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
+X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 8; ii = ii + 2) {
         bitonic_merge_two_zmm_64bit(
                 arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
@@ -179,11 +179,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
     bitonic_merge_four_zmm_64bit(arrzmm + 4, argzmm + 4);
     bitonic_merge_eight_zmm_64bit(arrzmm, argzmm);
 
-#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
+X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 4; ++ii) {
         argtype::storeu(arg + 8 * ii, argzmm[ii]);
     }
-#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
+X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 4; ++ii) {
         argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]);
     }
@@ -203,7 +203,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
 //    zmm_t arrzmm[16];
 //    argzmm_t argzmm[16];
 //
-//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+//X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 8; ++ii) {
 //        argzmm[ii] = argtype::loadu(arg + 8*ii);
 //        arrzmm[ii] = vtype::template i64gather(argzmm[ii], arr);
@@ -213,19 +213,19 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
 //    opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
 //    if (N != 128) {
 //        uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
-//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+//X86_SIMD_SORT_UNROLL_LOOP(8)
 //        for (int ii = 0; ii < 8; ++ii) {
 //            load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF;
 //        }
 //    }
-//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+//X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 8; ++ii) {
 //        argzmm[ii+8] = argtype::maskz_loadu(load_mask[ii], arg + 64 + 8*ii);
 //        arrzmm[ii+8] = vtype::template mask_i64gather(vtype::zmm_max(), load_mask[ii], argzmm[ii+8], arr);
 //        arrzmm[ii+8] = sort_zmm_64bit(arrzmm[ii+8], argzmm[ii+8]);
 //    }
 //
-//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+//X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 16; ii = ii + 2) {
 //        bitonic_merge_two_zmm_64bit(arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
 //    }
@@ -237,11 +237,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
 //    bitonic_merge_eight_zmm_64bit(arrzmm+8, argzmm+8);
 //    bitonic_merge_sixteen_zmm_64bit(arrzmm, argzmm);
 //
-//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+//X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 8; ++ii) {
 //        argtype::storeu(arg + 8*ii, argzmm[ii]);
 //    }
-//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+//X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 8; ++ii) {
 //        argtype::mask_storeu(arg + 64 + 8*ii, load_mask[ii], argzmm[ii + 8]);
 //    }
diff --git a/src/avx512-common-argsort.h b/src/avx512-common-argsort.h
index 7a36c0ef..c45b6130 100644
--- a/src/avx512-common-argsort.h
+++ b/src/avx512-common-argsort.h
@@ -198,7 +198,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
     // first and last vtype::numlanes values are partitioned at the end
     zmm_t vec_left[num_unroll], vec_right[num_unroll];
     argzmm_t argvec_left[num_unroll], argvec_right[num_unroll];
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii);
         vec_left[ii] = vtype::template i64gather(
@@ -224,7 +224,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
          */
         if ((r_store + vtype::numlanes) - right < left - l_store) {
             right -= num_unroll * vtype::numlanes;
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
             for (int ii = 0; ii < num_unroll; ++ii) {
                 arg_vec[ii] = argtype::loadu(arg + right + ii * vtype::numlanes);
@@ -233,7 +233,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
             }
         }
         else {
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
             for (int ii = 0; ii < num_unroll; ++ii) {
                 arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes);
                 curr_vec[ii] = vtype::template i64gather(
@@ -242,7 +242,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
             left += num_unroll * vtype::numlanes;
         }
         // partition the current vector and save it on both sides of the array
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
         for (int ii = 0; ii < num_unroll; ++ii) {
             int32_t amount_gt_pivot = partition_vec(arg,
@@ -259,7 +259,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
     }
 
     /* partition and save vec_left and vec_right */
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_gt_pivot = partition_vec(arg,
@@ -273,7 +273,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
         l_store += (vtype::numlanes - amount_gt_pivot);
         r_store -= amount_gt_pivot;
     }
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_gt_pivot = partition_vec(arg,
diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h
index 8cd75d70..4c907260 100644
--- a/src/avx512-common-qsort.h
+++ b/src/avx512-common-qsort.h
@@ -67,6 +67,8 @@
 #define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
 #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
 
+#define PRAGMA(x) _Pragma (#x)
+
 /* Compiler specific macros specific */
 #ifdef _MSC_VER
 #define X86_SIMD_SORT_INLINE static inline
@@ -93,8 +95,7 @@
 #endif
 
 #if __GNUC__ >= 8
-#define X86_SIMD_SORT_UNROLL_LOOP(num)\
-GCC unroll num
+#define X86_SIMD_SORT_UNROLL_LOOP(num) PRAGMA(GCC unroll num)
 #else
 #define X86_SIMD_SORT_UNROLL_LOOP(num)
 #endif
@@ -393,7 +394,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
     // We will now have atleast 16 registers worth of data to process:
     // left and right vtype::numlanes values are partitioned at the end
     zmm_t vec_left[num_unroll], vec_right[num_unroll];
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
         vec_right[ii] = vtype::loadu(
@@ -414,20 +415,20 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
          */
        if ((r_store + vtype::numlanes) - right < left - l_store) {
             right -= num_unroll * vtype::numlanes;
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
             for (int ii = 0; ii < num_unroll; ++ii) {
                 curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
             }
         }
         else {
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
             for (int ii = 0; ii < num_unroll; ++ii) {
                 curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
             }
             left += num_unroll * vtype::numlanes;
         }
         // partition the current vector and save it on both sides of the array
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
         for (int ii = 0; ii < num_unroll; ++ii) {
             int32_t amount_ge_pivot = partition_vec(arr,
@@ -443,7 +444,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
     }
 
     /* partition and save vec_left[8] and vec_right[8] */
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_ge_pivot = partition_vec(arr,
@@ -456,7 +457,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
         l_store += (vtype::numlanes - amount_ge_pivot);
         r_store -= amount_ge_pivot;
     }
-#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
+X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_ge_pivot = partition_vec(arr,
diff --git a/tests/test-argselect.hpp b/tests/test-argselect.hpp
index b5a98c1d..13506283 100644
--- a/tests/test-argselect.hpp
+++ b/tests/test-argselect.hpp
@@ -29,13 +29,15 @@ TYPED_TEST_P(avx512argselect, test_random)
                 = avx512_argselect(arr.data(), k, arr.size());
         auto true_kth = arr[sorted_inx[k]];
         EXPECT_EQ(true_kth, arr[inx[k]]) << "Failed at index k = " << k;
-        if (k >= 1)
+        if (k >= 1) {
             EXPECT_GE(true_kth, std_max_element(arr, inx, 0, k - 1))
                     << "failed at k = " << k;
-        if (k != arrsize - 1)
+        }
+        if (k != arrsize - 1) {
             EXPECT_LE(true_kth,
                       std_min_element(arr, inx, k + 1, arrsize - 1))
                     << "failed at k = " << k;
+        }
         EXPECT_UNIQUE(inx)
     }
 }
diff --git a/tests/test-argsort.hpp b/tests/test-argsort.hpp
index d2c403c2..62c3de60 100644
--- a/tests/test-argsort.hpp
+++ b/tests/test-argsort.hpp
@@ -23,7 +23,7 @@ TYPED_TEST_P(avx512argsort, test_random)
         std::vector inx2 = avx512_argsort(arr.data(), arr.size());
         std::vector sort1, sort2;
-        for (size_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             sort1.push_back(arr[inx1[jj]]);
             sort2.push_back(arr[inx2[jj]]);
         }
@@ -48,14 +48,14 @@ TYPED_TEST_P(avx512argsort, test_constant)
     for (auto &size : arrsizes) {
         /* constant array */
         auto elem = get_uniform_rand_array(1)[0];
-        for (int64_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             arr.push_back(elem);
         }
         std::vector inx1 = std_argsort(arr);
         std::vector inx2 = avx512_argsort(arr.data(), arr.size());
         std::vector sort1, sort2;
-        for (size_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             sort1.push_back(arr[inx1[jj]]);
             sort2.push_back(arr[inx2[jj]]);
         }
@@ -84,7 +84,7 @@ TYPED_TEST_P(avx512argsort, test_small_range)
         std::vector inx2 = avx512_argsort(arr.data(), arr.size());
         std::vector sort1, sort2;
-        for (size_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             sort1.push_back(arr[inx1[jj]]);
             sort2.push_back(arr[inx2[jj]]);
         }
@@ -113,7 +113,7 @@ TYPED_TEST_P(avx512argsort, test_sorted)
         std::vector inx2 = avx512_argsort(arr.data(), arr.size());
         std::vector sort1, sort2;
-        for (size_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             sort1.push_back(arr[inx1[jj]]);
             sort2.push_back(arr[inx2[jj]]);
         }
@@ -143,7 +143,7 @@ TYPED_TEST_P(avx512argsort, test_reverse)
         std::vector inx2 = avx512_argsort(arr.data(), arr.size());
         std::vector sort1, sort2;
-        for (size_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             sort1.push_back(arr[inx1[jj]]);
             sort2.push_back(arr[inx2[jj]]);
         }
@@ -177,7 +177,7 @@ TYPED_TEST_P(avx512argsort, test_array_with_nan)
         std::vector inx = avx512_argsort(arr.data(), arr.size());
         std::vector sort1;
-        for (size_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             sort1.push_back(arr[inx[jj]]);
         }
         if ((!std::isnan(sort1[size - 1])) || (!std::isnan(sort1[size - 2]))) {
@@ -211,7 +211,7 @@ TYPED_TEST_P(avx512argsort, test_max_value_at_end_of_array)
         }
         std::vector inx = avx512_argsort(arr.data(), arr.size());
         std::vector sorted;
-        for (size_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             sorted.push_back(arr[inx[jj]]);
         }
         if (!std::is_sorted(sorted.begin(), sorted.end())) {
@@ -250,7 +250,7 @@ TYPED_TEST_P(avx512argsort, test_all_inf_array)
         }
         std::vector inx = avx512_argsort(arr.data(), arr.size());
         std::vector sorted;
-        for (size_t jj = 0; jj < size; ++jj) {
+        for (auto jj = 0; jj < size; ++jj) {
             sorted.push_back(arr[inx[jj]]);
         }
         if (!std::is_sorted(sorted.begin(), sorted.end())) {
diff --git a/tests/test-partial-qsort.hpp b/tests/test-partial-qsort.hpp
index 6050b4c2..fee3d9f3 100644
--- a/tests/test-partial-qsort.hpp
+++ b/tests/test-partial-qsort.hpp
@@ -24,7 +24,7 @@ TYPED_TEST_P(avx512_partial_sort, test_ranges)
         /* Sort with std::sort for comparison */
         std::sort(sortedarr.begin(), sortedarr.end());
 
-        for (size_t ii = 0; ii < nranges; ++ii) {
+        for (auto ii = 0; ii < nranges; ++ii) {
             psortedarr = arr;
 
             /* Pick a random number of elements to sort at the beginning of the array */
@@ -33,7 +33,7 @@ TYPED_TEST_P(avx512_partial_sort, test_ranges)
             /* Sort the range and verify all the required elements
                match the presorted set */
             avx512_partial_qsort(
                     psortedarr.data(), k, psortedarr.size());
-            for (size_t jj = 0; jj < k; jj++) {
+            for (auto jj = 0; jj < k; jj++) {
                 ASSERT_EQ(sortedarr[jj], psortedarr[jj]);
             }