From 2789901d6d1123fb989f2b335df32249d3b2a36b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 9 Sep 2024 17:30:16 -0400 Subject: [PATCH 1/7] slightly-faster --- src/spreadinterp.cpp | 125 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 2 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 12327c2d6..1903646d4 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -114,6 +114,10 @@ static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const F const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, double bin_size_z, int debug); +static void bin_sort_singlethread_vector(BIGINT *ret, UBIGINT M, const FLT *kx, + const FLT *ky, const FLT *kz, UBIGINT N1, + UBIGINT N2, UBIGINT N3, double bin_size_x, + double bin_size_y, double bin_size_z, int debug); void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, double bin_size_z, int debug, int nthr); @@ -296,8 +300,8 @@ int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better! sort_nthr = (10 * M > N) ? maxnthr : 1; // heuristic if (sort_nthr == 1) - bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, - bin_size_y, bin_size_z, sort_debug); + bin_sort_singlethread_vector(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug); else // sort_nthr>1, user fixes # threads (>=2) bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, bin_size_y, bin_size_z, sort_debug, sort_nthr); @@ -1841,6 +1845,123 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, } } +void bin_sort_singlethread_vector( + BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, + const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x, + const double bin_size_y, const double bin_size_z, const int debug) +/* Returns permutation of all nonuniform points with good RAM access, + * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version + * + * This is achieved by binning into cuboids (of given bin_size within the + * overall box domain), then reading out the indices within + * these bins in a Cartesian cuboid ordering (x fastest, y med, z slowest). + * Finally the permutation is inverted, so that the good ordering is: the + * NU pt of index ret[0], the NU pt of index ret[1],..., NU pt of index ret[M-1] + * + * Inputs: M - number of input NU points. + * kx,ky,kz - length-M arrays of real coords of NU pts in [-pi, pi). + * Points outside this range are folded into it. + * N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D) + * bin_size_x,y,z - what binning box size to use in each dimension + * (in rescaled coords where ranges are [0,Ni] ). + * For 1D, only bin_size_x is used; for 2D, it & bin_size_y. + * Output: + * writes to ret a vector list of indices, each in the range 0,..,M-1. + * Thus, ret must have been preallocated for M BIGINTs. + * + * Notes: I compared RAM usage against declaring an internal vector and passing + * back; the latter used more RAM and was slower. + * Avoided the bins array, as in JFM's spreader of 2016, + * tidied up, early 2017, Barnett. + * Timings (2017): 3s for M=1e8 NU pts on 1 core of i7; 5s on 1 core of xeon. + * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed). + */ +{ + using simd_type = xsimd::batch; + using arch_t = simd_type::arch_type; + static constexpr auto simd_size = simd_type::size; + static constexpr auto alignment = arch_t::alignment(); + + constexpr auto to_array = [](const auto &vec) constexpr noexcept { + using T = decltype(std::decay_t()); + alignas(alignment) std::array array{}; + vec.store_aligned(array.data()); + return array; + }; + + const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) + // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, + // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). + // Note that round-off near kx=-pi stably rounds negative to i1=0. + const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); + const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; + const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins = nbins1 * nbins2 * nbins3; + const auto inv_bin_size_x = FLT(1.0 / bin_size_x); + const auto inv_bin_size_y = FLT(1.0 / bin_size_y); + const auto inv_bin_size_z = FLT(1.0 / bin_size_z); + const auto inv_bin_size_x_vec = simd_type(1.0 / bin_size_x); + const auto inv_bin_size_y_vec = simd_type(1.0 / bin_size_y); + const auto inv_bin_size_z_vec = simd_type(1.0 / bin_size_z); + const auto zero = to_int(simd_type(0)); + + // count how many pts in each bin + std::vector counts(nbins, 0); + const auto simd_M = M & (-simd_size); // round down to simd_size multiple + UBIGINT i{}; + for (i = 0; i < simd_M; i += simd_size) { + const auto i1 = to_int(simd_type::load_aligned(kx + i) * inv_bin_size_x_vec); + const auto i2 = + isky ? to_int(simd_type::load_aligned(ky + i) * inv_bin_size_y_vec) : zero; + const auto i3 = + iskz ? to_int(simd_type::load_aligned(kz + i) * inv_bin_size_z_vec) : zero; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + const auto bin_array = to_array(bin); + for (int j = 0; j < simd_size; j++) { + ++counts[bin_array[j]]; + } + } + for (; i < M; i++) { + // find the bin index in however many dims are needed + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + ++counts[bin]; + } + + // compute the offsets directly in the counts array (no offset array) + BIGINT current_offset = 0; + for (BIGINT i = 0; i < nbins; i++) { + BIGINT tmp = counts[i]; + counts[i] = current_offset; // Reinecke's cute replacement of counts[i] + current_offset += tmp; + } // (counts now contains the index offsets for each bin) + + for (i = 0; i < simd_M; i += simd_size) { + const auto i1 = xsimd::to_int((simd_type::load_aligned(kx + i) * inv_bin_size_x_vec)); + const auto i2 = + isky ? xsimd::to_int(simd_type::load_aligned(ky + i) * inv_bin_size_y_vec) : zero; + const auto i3 = + iskz ? xsimd::to_int(simd_type::load_aligned(kz + i) * inv_bin_size_z_vec) : zero; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + const auto bin_array = to_array(bin); + for (int j = 0; j < simd_size; j++) { + ret[counts[bin_array[j]]] = j + i; + counts[bin_array[j]]++; + } + } + for (; i < M; i++) { + // find the bin index (again! but better than using RAM) + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + ret[counts[bin]] = i; // fill the inverse map on the fly + ++counts[bin]; // update the offsets + } +} + void bin_sort_singlethread( BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x, From 7aa45ec0e712374402a96dff427aa51d0e3db29b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 11 Sep 2024 10:17:58 -0400 Subject: [PATCH 2/7] WIP --- src/spreadinterp.cpp | 66 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 1903646d4..1a2660cbc 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include using namespace std; @@ -73,8 +74,8 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker, const V... elems) noexcept; static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, UBIGINT N) noexcept; template -FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x, - UBIGINT N) noexcept; +static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x, + const BIGINT N) noexcept; static FINUFFT_ALWAYS_INLINE void set_kernel_args( FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector( @@ -1845,6 +1846,11 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, } } +template +simd_type make_incremented_vectors(std::index_sequence) { + return simd_type{Is...}; // Creates a SIMD vector with the index sequence +} + void bin_sort_singlethread_vector( BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x, @@ -1878,6 +1884,7 @@ void bin_sort_singlethread_vector( */ { using simd_type = xsimd::batch; + using int_simd_type = xsimd::batch>; using arch_t = simd_type::arch_type; static constexpr auto simd_size = simd_type::size; static constexpr auto alignment = arch_t::alignment(); @@ -1904,23 +1911,36 @@ void bin_sort_singlethread_vector( const auto inv_bin_size_y_vec = simd_type(1.0 / bin_size_y); const auto inv_bin_size_z_vec = simd_type(1.0 / bin_size_z); const auto zero = to_int(simd_type(0)); + const auto increment = + to_int(make_incremented_vectors(std::make_index_sequence{})); // count how many pts in each bin - std::vector counts(nbins, 0); + alignas(alignment) std::vector> counts(nbins + simd_size, 0); + alignas(alignment) std::vector> ref_counts(nbins + simd_size, + 0); const auto simd_M = M & (-simd_size); // round down to simd_size multiple UBIGINT i{}; for (i = 0; i < simd_M; i += simd_size) { - const auto i1 = to_int(simd_type::load_aligned(kx + i) * inv_bin_size_x_vec); + const auto i1 = xsimd::to_int( + fold_rescale(simd_type::load_unaligned(kx + i), N1) * inv_bin_size_x_vec); const auto i2 = - isky ? to_int(simd_type::load_aligned(ky + i) * inv_bin_size_y_vec) : zero; + isky ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(ky + i), N2) * + inv_bin_size_y_vec) + : zero; const auto i3 = - iskz ? to_int(simd_type::load_aligned(kz + i) * inv_bin_size_z_vec) : zero; + iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) * + inv_bin_size_z_vec) + : zero; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); const auto bin_array = to_array(bin); for (int j = 0; j < simd_size; j++) { - ++counts[bin_array[j]]; + ++ref_counts[bin_array[j]]; } + const auto bins = int_simd_type::gather(counts.data(), bin); + const auto incr_bins = bins + 1; + incr_bins.scatter(counts.data(), bin); } + for (; i < M; i++) { // find the bin index in however many dims are needed const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); @@ -1928,24 +1948,44 @@ void bin_sort_singlethread_vector( const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++counts[bin]; + ++ref_counts[bin]; + } + + for (i = 0; i < nbins; i++) { + if (counts[i] != ref_counts[i]) { + std::cerr << "Error: bin count mismatch at bin " << i + << " counts[i] = " << counts[i] << " ref_counts[i] = " << ref_counts[i] + << std::endl; + std::abort(); + } } // compute the offsets directly in the counts array (no offset array) BIGINT current_offset = 0; - for (BIGINT i = 0; i < nbins; i++) { + for (i = 0; i < nbins; i++) { BIGINT tmp = counts[i]; counts[i] = current_offset; // Reinecke's cute replacement of counts[i] current_offset += tmp; } // (counts now contains the index offsets for each bin) for (i = 0; i < simd_M; i += simd_size) { - const auto i1 = xsimd::to_int((simd_type::load_aligned(kx + i) * inv_bin_size_x_vec)); + const auto i1 = xsimd::to_int( + fold_rescale(simd_type::load_unaligned(kx + i), N1) * inv_bin_size_x_vec); const auto i2 = - isky ? xsimd::to_int(simd_type::load_aligned(ky + i) * inv_bin_size_y_vec) : zero; + isky ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(ky + i), N2) * + inv_bin_size_y_vec) + : zero; const auto i3 = - iskz ? xsimd::to_int(simd_type::load_aligned(kz + i) * inv_bin_size_z_vec) : zero; - const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - const auto bin_array = to_array(bin); + iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) * + inv_bin_size_z_vec) + : zero; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + // const auto bins = decltype(bin)::gather(counts.data(), bin); + // const auto ret_elems = decltype(bins)::gather(ret, bins) + (increment+i); + // ret_elems.scatter(ret, bins); + // const auto inc_bins = bins+1; + // inc_bins.scatter(counts.data(), bin); + const auto bin_array = to_array(to_int(bin)); for (int j = 0; j < simd_size; j++) { ret[counts[bin_array[j]]] = j + i; counts[bin_array[j]]++; From 8da21c26dfd67346d4d58205542834be8511050b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 12 Sep 2024 11:09:29 -0400 Subject: [PATCH 3/7] vectorized --- src/spreadinterp.cpp | 65 ++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 1a2660cbc..1707dac41 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1889,13 +1889,24 @@ void bin_sort_singlethread_vector( static constexpr auto simd_size = simd_type::size; static constexpr auto alignment = arch_t::alignment(); - constexpr auto to_array = [](const auto &vec) constexpr noexcept { + static constexpr auto to_array = [](const auto &vec) constexpr noexcept { using T = decltype(std::decay_t()); alignas(alignment) std::array array{}; vec.store_aligned(array.data()); return array; }; + static constexpr auto has_duplicates = [](const auto &vec) constexpr noexcept { + using T = decltype(std::decay_t()); + for (auto i = 0; i < simd_size; i++) { + const auto rotated = xsimd::rotl(vec, (sizeof(typename T::value_type) * 8) * i); + if ((rotated == vec).mask() != 0) { + return true; + } + } + return false; + }; + const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). @@ -1916,8 +1927,6 @@ void bin_sort_singlethread_vector( // count how many pts in each bin alignas(alignment) std::vector> counts(nbins + simd_size, 0); - alignas(alignment) std::vector> ref_counts(nbins + simd_size, - 0); const auto simd_M = M & (-simd_size); // round down to simd_size multiple UBIGINT i{}; for (i = 0; i < simd_M; i += simd_size) { @@ -1931,14 +1940,17 @@ void bin_sort_singlethread_vector( iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) * inv_bin_size_z_vec) : zero; - const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - const auto bin_array = to_array(bin); - for (int j = 0; j < simd_size; j++) { - ++ref_counts[bin_array[j]]; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + if (has_duplicates(bin)) { + const auto bin_array = to_array(bin); + for (int j = 0; j < simd_size; j++) { + ++counts[bin_array[j]]; + } + } else { + const auto bins = int_simd_type::gather(counts.data(), bin); + const auto incr_bins = bins + 1; + incr_bins.scatter(counts.data(), bin); } - const auto bins = int_simd_type::gather(counts.data(), bin); - const auto incr_bins = bins + 1; - incr_bins.scatter(counts.data(), bin); } for (; i < M; i++) { @@ -1948,16 +1960,6 @@ void bin_sort_singlethread_vector( const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++counts[bin]; - ++ref_counts[bin]; - } - - for (i = 0; i < nbins; i++) { - if (counts[i] != ref_counts[i]) { - std::cerr << "Error: bin count mismatch at bin " << i - << " counts[i] = " << counts[i] << " ref_counts[i] = " << ref_counts[i] - << std::endl; - std::abort(); - } } // compute the offsets directly in the counts array (no offset array) @@ -1979,16 +1981,19 @@ void bin_sort_singlethread_vector( iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) * inv_bin_size_z_vec) : zero; - const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - // const auto bins = decltype(bin)::gather(counts.data(), bin); - // const auto ret_elems = decltype(bins)::gather(ret, bins) + (increment+i); - // ret_elems.scatter(ret, bins); - // const auto inc_bins = bins+1; - // inc_bins.scatter(counts.data(), bin); - const auto bin_array = to_array(to_int(bin)); - for (int j = 0; j < simd_size; j++) { - ret[counts[bin_array[j]]] = j + i; - counts[bin_array[j]]++; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + const auto bins = decltype(bin)::gather(counts.data(), bin); + if (has_duplicates(bin) || has_duplicates(bins)) { + const auto bin_array = to_array(to_int(bin)); + for (int j = 0; j < simd_size; j++) { + ret[counts[bin_array[j]]] = j + i; + counts[bin_array[j]]++; + } + } else { + const auto incr_bins = bins + 1; + incr_bins.scatter(counts.data(), bin); + const auto result = increment + i; + result.scatter(ret, bins); } } for (; i < M; i++) { From 8a9fec92b5527cbca2f4e52b068607c9cde75c8e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 12 Sep 2024 15:10:34 -0400 Subject: [PATCH 4/7] it works now --- src/spreadinterp.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 1707dac41..bfbb67576 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1899,8 +1899,8 @@ void bin_sort_singlethread_vector( static constexpr auto has_duplicates = [](const auto &vec) constexpr noexcept { using T = decltype(std::decay_t()); for (auto i = 0; i < simd_size; i++) { - const auto rotated = xsimd::rotl(vec, (sizeof(typename T::value_type) * 8) * i); - if ((rotated == vec).mask() != 0) { + const auto rotated = xsimd::rotr(vec, sizeof(typename T::value_type) * 8 * i); + if ((rotated == vec) != xsimd::batch_bool(false)) { return true; } } @@ -1948,7 +1948,7 @@ void bin_sort_singlethread_vector( } } else { const auto bins = int_simd_type::gather(counts.data(), bin); - const auto incr_bins = bins + 1; + const auto incr_bins = xsimd::incr(bins); incr_bins.scatter(counts.data(), bin); } } @@ -1990,9 +1990,9 @@ void bin_sort_singlethread_vector( counts[bin_array[j]]++; } } else { - const auto incr_bins = bins + 1; + const auto incr_bins = xsimd::incr(bins); incr_bins.scatter(counts.data(), bin); - const auto result = increment + i; + const auto result = increment + int_simd_type(i); result.scatter(ret, bins); } } @@ -2053,6 +2053,7 @@ void bin_sort_singlethread( // count how many pts in each bin std::vector counts(nbins, 0); +#pragma omp simd for (auto i = 0; i < M; i++) { // find the bin index in however many dims are needed const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); @@ -2070,6 +2071,7 @@ void bin_sort_singlethread( current_offset += tmp; } // (counts now contains the index offsets for each bin) +#pragma omp simd for (auto i = 0; i < M; i++) { // find the bin index (again! but better than using RAM) const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); From 2f33b1f0fec4210660e9f97e3491de2a4c8b3c30 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 13 Sep 2024 12:34:22 -0400 Subject: [PATCH 5/7] vector check removed --- src/spreadinterp.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index bfbb67576..cc964402a 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1897,11 +1897,11 @@ void bin_sort_singlethread_vector( }; static constexpr auto has_duplicates = [](const auto &vec) constexpr noexcept { - using T = decltype(std::decay_t()); - for (auto i = 0; i < simd_size; i++) { - const auto rotated = xsimd::rotr(vec, sizeof(typename T::value_type) * 8 * i); - if ((rotated == vec) != xsimd::batch_bool(false)) { - return true; + for (int i = 0; i < simd_size; i++) { + for (int j = i + 1; j < simd_size; j++) { + if (vec[i] == vec[j]) { + return true; + } } } return false; @@ -1940,9 +1940,9 @@ void bin_sort_singlethread_vector( iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) * inv_bin_size_z_vec) : zero; - const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - if (has_duplicates(bin)) { - const auto bin_array = to_array(bin); + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + const auto bin_array = to_array(bin); + if (has_duplicates(bin_array)) { for (int j = 0; j < simd_size; j++) { ++counts[bin_array[j]]; } @@ -1981,10 +1981,11 @@ void bin_sort_singlethread_vector( iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) * inv_bin_size_z_vec) : zero; - const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - const auto bins = decltype(bin)::gather(counts.data(), bin); - if (has_duplicates(bin) || has_duplicates(bins)) { - const auto bin_array = to_array(to_int(bin)); + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + const auto bins = decltype(bin)::gather(counts.data(), bin); + const auto bin_array = to_array(to_int(bin)); + const auto bins_array = to_array(to_int(bins)); + if (has_duplicates(bin_array) || has_duplicates(bins_array)) { for (int j = 0; j < simd_size; j++) { ret[counts[bin_array[j]]] = j + i; counts[bin_array[j]]++; From cffbb9fee4f9378d0f9affcc10f43d2c85217f07 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 13 Sep 2024 12:46:40 -0400 Subject: [PATCH 6/7] minor optimizations --- src/spreadinterp.cpp | 48 ++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index cc964402a..5aa68bb11 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1851,6 +1851,28 @@ simd_type make_incremented_vectors(std::index_sequence) { return simd_type{Is...}; // Creates a SIMD vector with the index sequence } +template +static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates_impl( + const simd_type &vec) noexcept { + + if constexpr (N == simd_type::size) { + return false; + } else { + const auto duplicates = + (xsimd::rotate_right(vec) == vec) != xsimd::batch_bool(false); + if (duplicates) { + return true; + } + return has_duplicates_impl(vec); + } +} + +template +static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates( + const simd_type &vec) noexcept { + return has_duplicates_impl<1, simd_type>(vec); +} + void bin_sort_singlethread_vector( BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x, @@ -1896,17 +1918,6 @@ void bin_sort_singlethread_vector( return array; }; - static constexpr auto has_duplicates = [](const auto &vec) constexpr noexcept { - for (int i = 0; i < simd_size; i++) { - for (int j = i + 1; j < simd_size; j++) { - if (vec[i] == vec[j]) { - return true; - } - } - } - return false; - }; - const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). @@ -1940,9 +1951,9 @@ void bin_sort_singlethread_vector( iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) * inv_bin_size_z_vec) : zero; - const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - const auto bin_array = to_array(bin); - if (has_duplicates(bin_array)) { + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + if (has_duplicates(bin)) { + const auto bin_array = to_array(bin); for (int j = 0; j < simd_size; j++) { ++counts[bin_array[j]]; } @@ -1981,16 +1992,15 @@ void bin_sort_singlethread_vector( iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) * inv_bin_size_z_vec) : zero; - const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - const auto bins = decltype(bin)::gather(counts.data(), bin); - const auto bin_array = to_array(to_int(bin)); - const auto bins_array = to_array(to_int(bins)); - if (has_duplicates(bin_array) || has_duplicates(bins_array)) { + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + if (has_duplicates(bin)) { + const auto bin_array = to_array(to_int(bin)); for (int j = 0; j < simd_size; j++) { ret[counts[bin_array[j]]] = j + i; counts[bin_array[j]]++; } } else { + const auto bins = decltype(bin)::gather(counts.data(), bin); const auto incr_bins = xsimd::incr(bins); incr_bins.scatter(counts.data(), bin); const auto result = increment + int_simd_type(i); From 1e03401f67744d4d7a1bb3c41ad37297b7a38acf Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 13 Sep 2024 12:56:07 -0400 Subject: [PATCH 7/7] omp simd causes segfaults --- src/spreadinterp.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 5aa68bb11..0afeab562 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1847,13 +1847,13 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, } template -simd_type make_incremented_vectors(std::index_sequence) { +static FINUFFT_ALWAYS_INLINE simd_type make_incremented_vectors( + std::index_sequence) { return simd_type{Is...}; // Creates a SIMD vector with the index sequence } template -static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates_impl( - const simd_type &vec) noexcept { +static FINUFFT_ALWAYS_INLINE bool has_duplicates_impl(const simd_type &vec) noexcept { if constexpr (N == simd_type::size) { return false; @@ -1868,8 +1868,7 @@ static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates_impl( } template -static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates( - const simd_type &vec) noexcept { +static FINUFFT_ALWAYS_INLINE bool has_duplicates(const simd_type &vec) noexcept { return has_duplicates_impl<1, simd_type>(vec); } @@ -2064,7 +2063,6 @@ void bin_sort_singlethread( // count how many pts in each bin std::vector counts(nbins, 0); -#pragma omp simd for (auto i = 0; i < M; i++) { // find the bin index in however many dims are needed const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); @@ -2082,7 +2080,6 @@ void bin_sort_singlethread( current_offset += tmp; } // (counts now contains the index offsets for each bin) -#pragma omp simd for (auto i = 0; i < M; i++) { // find the bin index (again! but better than using RAM) const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);