diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 12327c2d6..0afeab562 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -14,6 +14,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
+#include <array>
 #include <vector>
 
 using namespace std;
@@ -73,8 +74,8 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker,
                                            const V... elems) noexcept;
 static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, UBIGINT N) noexcept;
 template<typename simd_type>
-FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x,
-                                                    UBIGINT N) noexcept;
+static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x,
+                                                    const BIGINT N) noexcept;
 static FINUFFT_ALWAYS_INLINE void set_kernel_args(
     FLT *args, FLT x, const finufft_spread_opts &opts) noexcept;
 static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(
@@ -114,6 +115,10 @@ static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const FLT *ky,
                                   const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3,
                                   double bin_size_x, double bin_size_y,
                                   double bin_size_z, int debug);
+static void bin_sort_singlethread_vector(BIGINT *ret, UBIGINT M, const FLT *kx,
+                                         const FLT *ky, const FLT *kz, UBIGINT N1,
+                                         UBIGINT N2, UBIGINT N3, double bin_size_x,
+                                         double bin_size_y, double bin_size_z, int debug);
 void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz,
                           UBIGINT N1, UBIGINT N2, UBIGINT N3, double bin_size_x,
                           double bin_size_y, double bin_size_z, int debug, int nthr);
@@ -296,8 +301,8 @@ int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M,
   if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better!
     sort_nthr = (10 * M > N) ? maxnthr : 1; // heuristic
   if (sort_nthr == 1)
-    bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
-                          bin_size_y, bin_size_z, sort_debug);
+    bin_sort_singlethread_vector(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
+                                 bin_size_y, bin_size_z, sort_debug);
   else // sort_nthr>1, user fixes # threads (>=2)
     bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
                          bin_size_y, bin_size_z, sort_debug, sort_nthr);
@@ -1841,6 +1846,177 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
   }
 }
 
+template<typename simd_type, std::size_t... Is>
+static FINUFFT_ALWAYS_INLINE simd_type make_incremented_vectors(
+    std::index_sequence<Is...>) {
+  return simd_type{Is...}; // creates a SIMD vector holding the index sequence
+}
+
+template<std::size_t N, typename simd_type>
+static FINUFFT_ALWAYS_INLINE bool has_duplicates_impl(const simd_type &vec) noexcept {
+  if constexpr (N == simd_type::size) {
+    return false; // vec compared against every nontrivial rotation: no duplicates
+  } else {
+    // a lane equal to the lane N places away means a repeated value
+    const auto duplicates = xsimd::rotate_right<N>(vec) == vec;
+    if (xsimd::any(duplicates)) {
+      return true;
+    }
+    return has_duplicates_impl<N + 1, simd_type>(vec);
+  }
+}
+
+template<typename simd_type>
+static FINUFFT_ALWAYS_INLINE bool has_duplicates(const simd_type &vec) noexcept {
+  return has_duplicates_impl<1, simd_type>(vec);
+}
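+
+// These two helpers drive the vectorized bin sort below:
+// make_incremented_vectors() builds the lane-index vector {0,1,...,simd_size-1}
+// used to scatter point indices, and has_duplicates() detects iterations in
+// which two SIMD lanes fall into the same bin, where a gather/increment/
+// scatter update would lose increments, so a scalar fallback is required.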
+
+void bin_sort_singlethread_vector(
+    BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
+    const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x,
+    const double bin_size_y, const double bin_size_z, const int debug)
+/* Returns permutation of all nonuniform points with good RAM access,
+ * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version.
+ *
+ * This is achieved by binning into cuboids (of given bin_size within the
+ * overall box domain), then reading out the indices within
+ * these bins in a Cartesian cuboid ordering (x fastest, y med, z slowest).
+ * Finally the permutation is inverted, so that the good ordering is: the
+ * NU pt of index ret[0], the NU pt of index ret[1],..., NU pt of index ret[M-1].
+ *
+ * Inputs: M - number of input NU points.
+ *         kx,ky,kz - length-M arrays of real coords of NU pts in [-pi, pi).
+ *                    Points outside this range are folded into it.
+ *         N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D).
+ *         bin_size_x,y,z - what binning box size to use in each dimension
+ *                    (in rescaled coords where ranges are [0,Ni] ).
+ *                    For 1D, only bin_size_x is used; for 2D, it & bin_size_y.
+ * Output:
+ *         writes to ret a vector list of indices, each in the range 0,..,M-1.
+ *         Thus, ret must have been preallocated for M BIGINTs.
+ *
+ * Notes: I compared RAM usage against declaring an internal vector and passing
+ * back; the latter used more RAM and was slower.
+ * Avoided the bins array, as in JFM's spreader of 2016,
+ * tidied up, early 2017, Barnett.
+ * Timings (2017): 3s for M=1e8 NU pts on 1 core of i7; 5s on 1 core of xeon.
+ * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
+ */
+{
+  using simd_type     = xsimd::batch<FLT>;
+  using int_simd_type = xsimd::batch<xsimd::as_integer_t<FLT>>;
+  using arch_t        = simd_type::arch_type;
+  static constexpr auto simd_size = simd_type::size;
+  static constexpr auto alignment = arch_t::alignment();
+
+  // copy the lanes of a SIMD vector into a std::array
+  static constexpr auto to_array = [](const auto &vec) constexpr noexcept {
+    using T = typename std::decay_t<decltype(vec)>::value_type;
+    alignas(alignment) std::array<T, simd_size> array{};
+    vec.store_aligned(array.data());
+    return array;
+  };
+
+  const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
+  // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
+  // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
+  // Note that round-off near kx=-pi stably rounds negative to i1=0.
+  const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1);
+  const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1;
+  const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1;
+  const auto nbins  = nbins1 * nbins2 * nbins3;
+  const auto inv_bin_size_x = FLT(1.0 / bin_size_x);
+  const auto inv_bin_size_y = FLT(1.0 / bin_size_y);
+  const auto inv_bin_size_z = FLT(1.0 / bin_size_z);
+  const auto inv_bin_size_x_vec = simd_type(1.0 / bin_size_x);
+  const auto inv_bin_size_y_vec = simd_type(1.0 / bin_size_y);
+  const auto inv_bin_size_z_vec = simd_type(1.0 / bin_size_z);
+  const auto zero = to_int(simd_type(0));
+  // integer SIMD vector {0, 1, ..., simd_size-1}
+  const auto increment =
+      to_int(make_incremented_vectors<simd_type>(std::make_index_sequence<simd_size>{}));
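+
+  // The permutation is built in two passes over the NU points: histogram the
+  // bin populations, convert the counts to cumulative start offsets in place,
+  // then write each point's index into ret at its bin's next free slot.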
+
+  // count how many pts in each bin
+  std::vector<BIGINT, xsimd::aligned_allocator<BIGINT, alignment>> counts(
+      nbins + simd_size, 0);
+  const auto simd_M = M & (-simd_size); // round down to simd_size multiple
+  UBIGINT i{};
+  for (i = 0; i < simd_M; i += simd_size) {
+    const auto i1 = xsimd::to_int(
+        fold_rescale(simd_type::load_unaligned(kx + i), N1) * inv_bin_size_x_vec);
+    const auto i2 =
+        isky ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(ky + i), N2) *
+                             inv_bin_size_y_vec)
+             : zero;
+    const auto i3 =
+        iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
+                             inv_bin_size_z_vec)
+             : zero;
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    if (has_duplicates(bin)) {
+      // two lanes hit the same bin: update the counts one lane at a time
+      const auto bin_array = to_array(bin);
+      for (std::size_t j = 0; j < simd_size; j++) {
+        ++counts[bin_array[j]];
+      }
+    } else {
+      const auto bins      = int_simd_type::gather(counts.data(), bin);
+      const auto incr_bins = xsimd::incr(bins);
+      incr_bins.scatter(counts.data(), bin);
+    }
+  }
+  for (; i < M; i++) {
+    // find the bin index in however many dims are needed
+    const auto i1  = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
+    const auto i2  = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
+    const auto i3  = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    ++counts[bin];
+  }
+
+  // compute the offsets directly in the counts array (no offset array)
+  BIGINT current_offset = 0;
+  for (i = 0; i < UBIGINT(nbins); i++) {
+    BIGINT tmp = counts[i];
+    counts[i]  = current_offset; // Reinecke's cute replacement of counts[i]
+    current_offset += tmp;
+  } // (counts now contains the index offsets for each bin)
+
+  for (i = 0; i < simd_M; i += simd_size) {
+    const auto i1 = xsimd::to_int(
+        fold_rescale(simd_type::load_unaligned(kx + i), N1) * inv_bin_size_x_vec);
+    const auto i2 =
+        isky ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(ky + i), N2) *
+                             inv_bin_size_y_vec)
+             : zero;
+    const auto i3 =
+        iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
+                             inv_bin_size_z_vec)
+             : zero;
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    if (has_duplicates(bin)) {
+      // colliding lanes: fill ret and advance the offsets one lane at a time
+      const auto bin_array = to_array(bin);
+      for (std::size_t j = 0; j < simd_size; j++) {
+        ret[counts[bin_array[j]]] = j + i;
+        counts[bin_array[j]]++;
+      }
+    } else {
+      const auto bins      = decltype(bin)::gather(counts.data(), bin);
+      const auto incr_bins = xsimd::incr(bins);
+      incr_bins.scatter(counts.data(), bin);
+      const auto result = increment + int_simd_type(i);
+      result.scatter(ret, bins);
+    }
+  }
+  for (; i < M; i++) {
+    // find the bin index (again! but better than using RAM)
+    const auto i1  = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
+    const auto i2  = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
+    const auto i3  = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    ret[counts[bin]] = i; // fill the inverse map on the fly
+    ++counts[bin];        // update the offsets
+  }
+}
+
 void bin_sort_singlethread(
     BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
     const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x,
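
Review note: for readers unfamiliar with the counting-sort pattern that bin_sort_singlethread_vector vectorizes, here is a minimal scalar sketch (not part of the patch) of the same histogram / offset / inversion logic, plus a toy emulation of the lane-collision hazard that motivates has_duplicates(). Plain int64_t stands in for BIGINT, and ordinary loops stand in for the xsimd gather/increment/scatter:

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Scalar reference of the counting-sort permutation:
  const std::vector<int64_t> bin = {2, 0, 2, 1, 0}; // bin index of each NU point
  const int64_t nbins = 3;
  std::vector<int64_t> counts(nbins, 0), ret(bin.size());
  for (auto b : bin) ++counts[b]; // pass 1: histogram
  int64_t offset = 0;             // exclusive prefix sum, in place
  for (auto &c : counts) {
    const auto tmp = c;
    c = offset;
    offset += tmp;
  }
  for (std::size_t i = 0; i < bin.size(); ++i) // pass 2: invert the map
    ret[counts[bin[i]]++] = int64_t(i);
  for (auto r : ret) std::printf("%lld ", (long long)r); // prints: 1 4 3 0 2
  std::printf("\n");

  // Why duplicate lanes need the scalar fallback: emulate a 4-lane
  // gather -> increment -> scatter histogram update with a repeated bin.
  const std::array<int64_t, 4> lane_bin = {2, 2, 1, 0};
  std::array<int64_t, 4> gathered{};
  std::vector<int64_t> counts2(nbins, 0);
  for (int j = 0; j < 4; ++j) gathered[j] = counts2[lane_bin[j]]; // gather
  for (int j = 0; j < 4; ++j) ++gathered[j];                      // increment
  for (int j = 0; j < 4; ++j) counts2[lane_bin[j]] = gathered[j]; // scatter
  // both lanes holding bin 2 gathered the same old count, so one increment is lost:
  std::printf("bin 2 counted %lld time(s), expected 2\n", (long long)counts2[2]);
  return 0;
}

The second half prints "bin 2 counted 1 time(s), expected 2": the two lanes addressing bin 2 read the same old count, so the scattered results overwrite each other. That lost update is exactly what the has_duplicates() check in the patch guards against.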