From 2789901d6d1123fb989f2b335df32249d3b2a36b Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Mon, 9 Sep 2024 17:30:16 -0400
Subject: [PATCH 1/7] slightly-faster

---
 src/spreadinterp.cpp | 125 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 12327c2d6..1903646d4 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -114,6 +114,10 @@ static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const F
                                   const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3,
                                   double bin_size_x, double bin_size_y, double bin_size_z,
                                   int debug);
+static void bin_sort_singlethread_vector(BIGINT *ret, UBIGINT M, const FLT *kx,
+                                         const FLT *ky, const FLT *kz, UBIGINT N1,
+                                         UBIGINT N2, UBIGINT N3, double bin_size_x,
+                                         double bin_size_y, double bin_size_z, int debug);
 void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1,
                           UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y,
                           double bin_size_z, int debug, int nthr);
@@ -296,8 +300,8 @@ int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT
     if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better!
       sort_nthr = (10 * M > N) ? maxnthr : 1; // heuristic
     if (sort_nthr == 1)
-      bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
-                            bin_size_y, bin_size_z, sort_debug);
+      bin_sort_singlethread_vector(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
+                                   bin_size_y, bin_size_z, sort_debug);
     else // sort_nthr>1, user fixes # threads (>=2)
       bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
                            bin_size_y, bin_size_z, sort_debug, sort_nthr);
@@ -1841,6 +1845,123 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
   }
 }
 
+void bin_sort_singlethread_vector(
+    BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
+    const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x,
+    const double bin_size_y, const double bin_size_z, const int debug)
+/* Returns permutation of all nonuniform points with good RAM access,
+ * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version
+ *
+ * This is achieved by binning into cuboids (of given bin_size within the
+ * overall box domain), then reading out the indices within
+ * these bins in a Cartesian cuboid ordering (x fastest, y med, z slowest).
+ * Finally the permutation is inverted, so that the good ordering is: the
+ * NU pt of index ret[0], the NU pt of index ret[1],..., NU pt of index ret[M-1]
+ *
+ * Inputs: M - number of input NU points.
+ *         kx,ky,kz - length-M arrays of real coords of NU pts in [-pi, pi).
+ *                    Points outside this range are folded into it.
+ *         N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D)
+ *         bin_size_x,y,z - what binning box size to use in each dimension
+ *                    (in rescaled coords where ranges are [0,Ni] ).
+ *                    For 1D, only bin_size_x is used; for 2D, it & bin_size_y.
+ * Output:
+ *         writes to ret a vector list of indices, each in the range 0,..,M-1.
+ *         Thus, ret must have been preallocated for M BIGINTs.
+ *
+ * Notes: I compared RAM usage against declaring an internal vector and passing
+ * back; the latter used more RAM and was slower.
+ * Avoided the bins array, as in JFM's spreader of 2016,
+ * tidied up, early 2017, Barnett.
+ * Timings (2017): 3s for M=1e8 NU pts on 1 core of i7; 5s on 1 core of xeon.
+ * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
+ */
+{
+  using simd_type                 = xsimd::batch<FLT>;
+  using arch_t                    = simd_type::arch_type;
+  static constexpr auto simd_size = simd_type::size;
+  static constexpr auto alignment = arch_t::alignment();
+
+  constexpr auto to_array = [](const auto &vec) constexpr noexcept {
+    using T = decltype(std::decay_t<decltype(vec)>());
+    alignas(alignment) std::array<typename T::value_type, T::size> array{};
+    vec.store_aligned(array.data());
+    return array;
+  };
+
+  const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
+  // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
+  // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
+  // Note that round-off near kx=-pi stably rounds negative to i1=0.
+  const auto nbins1             = BIGINT(FLT(N1) / bin_size_x + 1);
+  const auto nbins2             = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1;
+  const auto nbins3             = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1;
+  const auto nbins              = nbins1 * nbins2 * nbins3;
+  const auto inv_bin_size_x     = FLT(1.0 / bin_size_x);
+  const auto inv_bin_size_y     = FLT(1.0 / bin_size_y);
+  const auto inv_bin_size_z     = FLT(1.0 / bin_size_z);
+  const auto inv_bin_size_x_vec = simd_type(1.0 / bin_size_x);
+  const auto inv_bin_size_y_vec = simd_type(1.0 / bin_size_y);
+  const auto inv_bin_size_z_vec = simd_type(1.0 / bin_size_z);
+  const auto zero               = to_int(simd_type(0));
+
+  // count how many pts in each bin
+  std::vector<BIGINT> counts(nbins, 0);
+  const auto simd_M = M & (-simd_size); // round down to simd_size multiple
+  UBIGINT i{};
+  for (i = 0; i < simd_M; i += simd_size) {
+    const auto i1 = to_int(simd_type::load_aligned(kx + i) * inv_bin_size_x_vec);
+    const auto i2 =
+        isky ? to_int(simd_type::load_aligned(ky + i) * inv_bin_size_y_vec) : zero;
+    const auto i3 =
+        iskz ? to_int(simd_type::load_aligned(kz + i) * inv_bin_size_z_vec) : zero;
+    const auto bin       = i1 + nbins1 * (i2 + nbins2 * i3);
+    const auto bin_array = to_array(bin);
+    for (int j = 0; j < simd_size; j++) {
+      ++counts[bin_array[j]];
+    }
+  }
+  for (; i < M; i++) {
+    // find the bin index in however many dims are needed
+    const auto i1  = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
+    const auto i2  = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
+    const auto i3  = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    ++counts[bin];
+  }
+
+  // compute the offsets directly in the counts array (no offset array)
+  BIGINT current_offset = 0;
+  for (BIGINT i = 0; i < nbins; i++) {
+    BIGINT tmp = counts[i];
+    counts[i]  = current_offset; // Reinecke's cute replacement of counts[i]
+    current_offset += tmp;
+  } // (counts now contains the index offsets for each bin)
+
+  for (i = 0; i < simd_M; i += simd_size) {
+    const auto i1 = xsimd::to_int((simd_type::load_aligned(kx + i) * inv_bin_size_x_vec));
+    const auto i2 =
+        isky ? xsimd::to_int(simd_type::load_aligned(ky + i) * inv_bin_size_y_vec) : zero;
+    const auto i3 =
+        iskz ? xsimd::to_int(simd_type::load_aligned(kz + i) * inv_bin_size_z_vec) : zero;
+    const auto bin       = i1 + nbins1 * (i2 + nbins2 * i3);
+    const auto bin_array = to_array(bin);
+    for (int j = 0; j < simd_size; j++) {
+      ret[counts[bin_array[j]]] = j + i;
+      counts[bin_array[j]]++;
+    }
+  }
+  for (; i < M; i++) {
+    // find the bin index (again! but better than using RAM)
+    const auto i1    = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
+    const auto i2    = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
+    const auto i3    = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
+    const auto bin   = i1 + nbins1 * (i2 + nbins2 * i3);
+    ret[counts[bin]] = i; // fill the inverse map on the fly
+    ++counts[bin];        // update the offsets
+  }
+}
+
 void bin_sort_singlethread(
     BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
     const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x,

From 7aa45ec0e712374402a96dff427aa51d0e3db29b Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Wed, 11 Sep 2024 10:17:58 -0400
Subject: [PATCH 2/7] WIP

---
 src/spreadinterp.cpp | 66 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 1903646d4..1a2660cbc 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -14,6 +14,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
+#include <iostream>
 #include <vector>
 
 using namespace std;
@@ -73,8 +74,8 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker,
                                            const V... elems) noexcept;
 static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, UBIGINT N) noexcept;
 template<class simd_type>
-FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x,
-                                                    UBIGINT N) noexcept;
+static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x,
+                                                    const BIGINT N) noexcept;
 static FINUFFT_ALWAYS_INLINE void set_kernel_args(
     FLT *args, FLT x, const finufft_spread_opts &opts) noexcept;
 static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(
@@ -1845,6 +1846,11 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
   }
 }
 
+template<typename simd_type, std::size_t... Is>
+simd_type make_incremented_vectors(std::index_sequence<Is...>) {
+  return simd_type{Is...}; // Creates a SIMD vector with the index sequence
+}
+
 void bin_sort_singlethread_vector(
     BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
     const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x,
@@ -1878,6 +1884,7 @@ void bin_sort_singlethread_vector(
  */
 {
   using simd_type                 = xsimd::batch<FLT>;
+  using int_simd_type             = xsimd::batch<xsimd::as_integer_t<FLT>>;
   using arch_t                    = simd_type::arch_type;
   static constexpr auto simd_size = simd_type::size;
   static constexpr auto alignment = arch_t::alignment();
@@ -1904,23 +1911,36 @@ void bin_sort_singlethread_vector(
   const auto inv_bin_size_y_vec = simd_type(1.0 / bin_size_y);
   const auto inv_bin_size_z_vec = simd_type(1.0 / bin_size_z);
   const auto zero               = to_int(simd_type(0));
+  const auto increment =
+      to_int(make_incremented_vectors<simd_type>(std::make_index_sequence<simd_size>{}));
 
   // count how many pts in each bin
-  std::vector<BIGINT> counts(nbins, 0);
+  alignas(alignment) std::vector<xsimd::as_integer_t<FLT>> counts(nbins + simd_size, 0);
+  alignas(alignment) std::vector<xsimd::as_integer_t<FLT>> ref_counts(nbins + simd_size,
+                                                                      0);
   const auto simd_M = M & (-simd_size); // round down to simd_size multiple
   UBIGINT i{};
   for (i = 0; i < simd_M; i += simd_size) {
-    const auto i1 = to_int(simd_type::load_aligned(kx + i) * inv_bin_size_x_vec);
+    const auto i1 = xsimd::to_int(
+        fold_rescale(simd_type::load_unaligned(kx + i), N1) * inv_bin_size_x_vec);
     const auto i2 =
-        isky ? to_int(simd_type::load_aligned(ky + i) * inv_bin_size_y_vec) : zero;
+        isky ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(ky + i), N2) *
+                             inv_bin_size_y_vec)
+             : zero;
     const auto i3 =
-        iskz ? to_int(simd_type::load_aligned(kz + i) * inv_bin_size_z_vec) : zero;
+        iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
+                             inv_bin_size_z_vec)
+             : zero;
     const auto bin       = i1 + nbins1 * (i2 + nbins2 * i3);
     const auto bin_array = to_array(bin);
     for (int j = 0; j < simd_size; j++) {
-      ++counts[bin_array[j]];
+      ++ref_counts[bin_array[j]];
     }
+    const auto bins      = int_simd_type::gather(counts.data(), bin);
+    const auto incr_bins = bins + 1;
+    incr_bins.scatter(counts.data(), bin);
   }
+
   for (; i < M; i++) {
     // find the bin index in however many dims are needed
     const auto i1  = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
@@ -1928,24 +1948,44 @@ void bin_sort_singlethread_vector(
     const auto i3  = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
     const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
     ++counts[bin];
+    ++ref_counts[bin];
+  }
+
+  for (i = 0; i < nbins; i++) {
+    if (counts[i] != ref_counts[i]) {
+      std::cerr << "Error: bin count mismatch at bin " << i
+                << " counts[i] = " << counts[i] << " ref_counts[i] = " << ref_counts[i]
+                << std::endl;
+      std::abort();
+    }
   }
 
   // compute the offsets directly in the counts array (no offset array)
   BIGINT current_offset = 0;
-  for (BIGINT i = 0; i < nbins; i++) {
+  for (i = 0; i < nbins; i++) {
     BIGINT tmp = counts[i];
     counts[i]  = current_offset; // Reinecke's cute replacement of counts[i]
     current_offset += tmp;
   } // (counts now contains the index offsets for each bin)
 
   for (i = 0; i < simd_M; i += simd_size) {
-    const auto i1 = xsimd::to_int((simd_type::load_aligned(kx + i) * inv_bin_size_x_vec));
+    const auto i1 = xsimd::to_int(
+        fold_rescale(simd_type::load_unaligned(kx + i), N1) * inv_bin_size_x_vec);
     const auto i2 =
-        isky ? xsimd::to_int(simd_type::load_aligned(ky + i) * inv_bin_size_y_vec) : zero;
+        isky ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(ky + i), N2) *
+                             inv_bin_size_y_vec)
+             : zero;
     const auto i3 =
-        iskz ? xsimd::to_int(simd_type::load_aligned(kz + i) * inv_bin_size_z_vec) : zero;
-    const auto bin       = i1 + nbins1 * (i2 + nbins2 * i3);
-    const auto bin_array = to_array(bin);
+        iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
+                             inv_bin_size_z_vec)
+             : zero;
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    // const auto bins = decltype(bin)::gather(counts.data(), bin);
+    // const auto ret_elems = decltype(bins)::gather(ret, bins) + (increment+i);
+    // ret_elems.scatter(ret, bins);
+    // const auto inc_bins = bins+1;
+    // inc_bins.scatter(counts.data(), bin);
+    const auto bin_array = to_array(to_int(bin));
     for (int j = 0; j < simd_size; j++) {
       ret[counts[bin_array[j]]] = j + i;
       counts[bin_array[j]]++;

From 8da21c26dfd67346d4d58205542834be8511050b Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Thu, 12 Sep 2024 11:09:29 -0400
Subject: [PATCH 3/7] vectorized

---
 src/spreadinterp.cpp | 65 ++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 1a2660cbc..1707dac41 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1889,13 +1889,24 @@ void bin_sort_singlethread_vector(
   static constexpr auto simd_size = simd_type::size;
   static constexpr auto alignment = arch_t::alignment();
 
-  constexpr auto to_array = [](const auto &vec) constexpr noexcept {
+  static constexpr auto to_array = [](const auto &vec) constexpr noexcept {
     using T = decltype(std::decay_t<decltype(vec)>());
     alignas(alignment) std::array<typename T::value_type, T::size> array{};
     vec.store_aligned(array.data());
     return array;
   };
 
+  static constexpr auto has_duplicates = [](const auto &vec) constexpr noexcept {
+    using T = decltype(std::decay_t<decltype(vec)>());
+    for (auto i = 0; i < simd_size; i++) {
+      const auto rotated = xsimd::rotl(vec, (sizeof(typename T::value_type) * 8) * i);
+      if ((rotated == vec).mask() != 0) {
+        return true;
+      }
+    }
+    return false;
+  };
+
   const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
   // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
   // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
@@ -1916,8 +1927,6 @@ void bin_sort_singlethread_vector(
 
   // count how many pts in each bin
   alignas(alignment) std::vector<xsimd::as_integer_t<FLT>> counts(nbins + simd_size, 0);
-  alignas(alignment) std::vector<xsimd::as_integer_t<FLT>> ref_counts(nbins + simd_size,
-                                                                      0);
   const auto simd_M = M & (-simd_size); // round down to simd_size multiple
   UBIGINT i{};
   for (i = 0; i < simd_M; i += simd_size) {
@@ -1931,14 +1940,17 @@ void bin_sort_singlethread_vector(
         iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
                              inv_bin_size_z_vec)
              : zero;
-    const auto bin       = i1 + nbins1 * (i2 + nbins2 * i3);
-    const auto bin_array = to_array(bin);
-    for (int j = 0; j < simd_size; j++) {
-      ++ref_counts[bin_array[j]];
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    if (has_duplicates(bin)) {
+      const auto bin_array = to_array(bin);
+      for (int j = 0; j < simd_size; j++) {
+        ++counts[bin_array[j]];
+      }
+    } else {
+      const auto bins      = int_simd_type::gather(counts.data(), bin);
+      const auto incr_bins = bins + 1;
+      incr_bins.scatter(counts.data(), bin);
     }
-    const auto bins      = int_simd_type::gather(counts.data(), bin);
-    const auto incr_bins = bins + 1;
-    incr_bins.scatter(counts.data(), bin);
   }
 
   for (; i < M; i++) {
@@ -1948,16 +1960,6 @@ void bin_sort_singlethread_vector(
     const auto i3  = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
     const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
     ++counts[bin];
-    ++ref_counts[bin];
-  }
-
-  for (i = 0; i < nbins; i++) {
-    if (counts[i] != ref_counts[i]) {
-      std::cerr << "Error: bin count mismatch at bin " << i
-                << " counts[i] = " << counts[i] << " ref_counts[i] = " << ref_counts[i]
-                << std::endl;
-      std::abort();
-    }
   }
 
   // compute the offsets directly in the counts array (no offset array)
@@ -1979,16 +1981,19 @@ void bin_sort_singlethread_vector(
         iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
                              inv_bin_size_z_vec)
              : zero;
-    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
-    // const auto bins = decltype(bin)::gather(counts.data(), bin);
-    // const auto ret_elems = decltype(bins)::gather(ret, bins) + (increment+i);
-    // ret_elems.scatter(ret, bins);
-    // const auto inc_bins = bins+1;
-    // inc_bins.scatter(counts.data(), bin);
-    const auto bin_array = to_array(to_int(bin));
-    for (int j = 0; j < simd_size; j++) {
-      ret[counts[bin_array[j]]] = j + i;
-      counts[bin_array[j]]++;
+    const auto bin  = i1 + nbins1 * (i2 + nbins2 * i3);
+    const auto bins = decltype(bin)::gather(counts.data(), bin);
+    if (has_duplicates(bin) || has_duplicates(bins)) {
+      const auto bin_array = to_array(to_int(bin));
+      for (int j = 0; j < simd_size; j++) {
+        ret[counts[bin_array[j]]] = j + i;
+        counts[bin_array[j]]++;
+      }
+    } else {
+      const auto incr_bins = bins + 1;
+      incr_bins.scatter(counts.data(), bin);
+      const auto result = increment + i;
+      result.scatter(ret, bins);
     }
   }
   for (; i < M; i++) {

From 8a9fec92b5527cbca2f4e52b068607c9cde75c8e Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Thu, 12 Sep 2024 15:10:34 -0400
Subject: [PATCH 4/7] it works now

---
 src/spreadinterp.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 1707dac41..bfbb67576 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1899,8 +1899,8 @@ void bin_sort_singlethread_vector(
   static constexpr auto has_duplicates = [](const auto &vec) constexpr noexcept {
     using T = decltype(std::decay_t<decltype(vec)>());
     for (auto i = 0; i < simd_size; i++) {
-      const auto rotated = xsimd::rotl(vec, (sizeof(typename T::value_type) * 8) * i);
-      if ((rotated == vec).mask() != 0) {
+      const auto rotated = xsimd::rotr(vec, sizeof(typename T::value_type) * 8 * i);
+      if ((rotated == vec) != xsimd::batch_bool<bool>(false)) {
         return true;
       }
     }
@@ -1948,7 +1948,7 @@ void bin_sort_singlethread_vector(
       }
     } else {
       const auto bins      = int_simd_type::gather(counts.data(), bin);
-      const auto incr_bins = bins + 1;
+      const auto incr_bins = xsimd::incr(bins);
       incr_bins.scatter(counts.data(), bin);
     }
   }
@@ -1990,9 +1990,9 @@ void bin_sort_singlethread_vector(
         counts[bin_array[j]]++;
       }
     } else {
-      const auto incr_bins = bins + 1;
+      const auto incr_bins = xsimd::incr(bins);
       incr_bins.scatter(counts.data(), bin);
-      const auto result = increment + i;
+      const auto result = increment + int_simd_type(i);
       result.scatter(ret, bins);
     }
   }
@@ -2053,6 +2053,7 @@ void bin_sort_singlethread(
   // count how many pts in each bin
   std::vector<BIGINT> counts(nbins, 0);
 
+#pragma omp simd
   for (auto i = 0; i < M; i++) {
     // find the bin index in however many dims are needed
     const auto i1  = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
@@ -2070,6 +2071,7 @@ void bin_sort_singlethread(
     current_offset += tmp;
   } // (counts now contains the index offsets for each bin)
 
+#pragma omp simd
   for (auto i = 0; i < M; i++) {
     // find the bin index (again! but better than using RAM)
     const auto i1    = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);

From 2f33b1f0fec4210660e9f97e3491de2a4c8b3c30 Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Fri, 13 Sep 2024 12:34:22 -0400
Subject: [PATCH 5/7] vector check removed

---
 src/spreadinterp.cpp | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index bfbb67576..cc964402a 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1897,11 +1897,11 @@ void bin_sort_singlethread_vector(
   };
 
   static constexpr auto has_duplicates = [](const auto &vec) constexpr noexcept {
-    using T = decltype(std::decay_t<decltype(vec)>());
-    for (auto i = 0; i < simd_size; i++) {
-      const auto rotated = xsimd::rotr(vec, sizeof(typename T::value_type) * 8 * i);
-      if ((rotated == vec) != xsimd::batch_bool<bool>(false)) {
-        return true;
+    for (int i = 0; i < simd_size; i++) {
+      for (int j = i + 1; j < simd_size; j++) {
+        if (vec[i] == vec[j]) {
+          return true;
+        }
       }
     }
     return false;
@@ -1940,9 +1940,9 @@ void bin_sort_singlethread_vector(
         iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
                              inv_bin_size_z_vec)
              : zero;
-    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
-    if (has_duplicates(bin)) {
-      const auto bin_array = to_array(bin);
+    const auto bin       = i1 + nbins1 * (i2 + nbins2 * i3);
+    const auto bin_array = to_array(bin);
+    if (has_duplicates(bin_array)) {
       for (int j = 0; j < simd_size; j++) {
         ++counts[bin_array[j]];
       }
@@ -1981,10 +1981,11 @@ void bin_sort_singlethread_vector(
         iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
                              inv_bin_size_z_vec)
              : zero;
-    const auto bin  = i1 + nbins1 * (i2 + nbins2 * i3);
-    const auto bins = decltype(bin)::gather(counts.data(), bin);
-    if (has_duplicates(bin) || has_duplicates(bins)) {
-      const auto bin_array = to_array(to_int(bin));
+    const auto bin        = i1 + nbins1 * (i2 + nbins2 * i3);
+    const auto bins       = decltype(bin)::gather(counts.data(), bin);
+    const auto bin_array  = to_array(to_int(bin));
+    const auto bins_array = to_array(to_int(bins));
+    if (has_duplicates(bin_array) || has_duplicates(bins_array)) {
       for (int j = 0; j < simd_size; j++) {
         ret[counts[bin_array[j]]] = j + i;
         counts[bin_array[j]]++;

From cffbb9fee4f9378d0f9affcc10f43d2c85217f07 Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Fri, 13 Sep 2024 12:46:40 -0400
Subject: [PATCH 6/7] minor optimizations

---
 src/spreadinterp.cpp | 48 ++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index cc964402a..5aa68bb11 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1851,6 +1851,28 @@ simd_type make_incremented_vectors(std::index_sequence<Is...>) {
   return simd_type{Is...}; // Creates a SIMD vector with the index sequence
 }
 
+template<std::size_t N, typename simd_type>
+static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates_impl(
+    const simd_type &vec) noexcept {
+
+  if constexpr (N == simd_type::size) {
+    return false;
+  } else {
+    const auto duplicates =
+        (xsimd::rotate_right<N>(vec) == vec) != xsimd::batch_bool<bool>(false);
+    if (duplicates) {
+      return true;
+    }
+    return has_duplicates_impl<N + 1>(vec);
+  }
+}
+
+template<typename simd_type>
+static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates(
+    const simd_type &vec) noexcept {
+  return has_duplicates_impl<1, simd_type>(vec);
+}
+
 void bin_sort_singlethread_vector(
     BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
     const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x,
@@ -1896,17 +1918,6 @@ void bin_sort_singlethread_vector(
     return array;
   };
 
-  static constexpr auto has_duplicates = [](const auto &vec) constexpr noexcept {
-    for (int i = 0; i < simd_size; i++) {
-      for (int j = i + 1; j < simd_size; j++) {
-        if (vec[i] == vec[j]) {
-          return true;
-        }
-      }
-    }
-    return false;
-  };
-
   const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
   // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
   // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
@@ -1940,9 +1951,9 @@ void bin_sort_singlethread_vector(
         iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
                              inv_bin_size_z_vec)
              : zero;
-    const auto bin       = i1 + nbins1 * (i2 + nbins2 * i3);
-    const auto bin_array = to_array(bin);
-    if (has_duplicates(bin_array)) {
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    if (has_duplicates(bin)) {
+      const auto bin_array = to_array(bin);
       for (int j = 0; j < simd_size; j++) {
         ++counts[bin_array[j]];
       }
@@ -1981,16 +1992,15 @@ void bin_sort_singlethread_vector(
         iskz ? xsimd::to_int(fold_rescale(simd_type::load_unaligned(kz + i), N3) *
                              inv_bin_size_z_vec)
              : zero;
-    const auto bin        = i1 + nbins1 * (i2 + nbins2 * i3);
-    const auto bins       = decltype(bin)::gather(counts.data(), bin);
-    const auto bin_array  = to_array(to_int(bin));
-    const auto bins_array = to_array(to_int(bins));
-    if (has_duplicates(bin_array) || has_duplicates(bins_array)) {
+    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+    if (has_duplicates(bin)) {
+      const auto bin_array = to_array(to_int(bin));
       for (int j = 0; j < simd_size; j++) {
         ret[counts[bin_array[j]]] = j + i;
         counts[bin_array[j]]++;
       }
     } else {
+      const auto bins      = decltype(bin)::gather(counts.data(), bin);
       const auto incr_bins = xsimd::incr(bins);
       incr_bins.scatter(counts.data(), bin);
       const auto result = increment + int_simd_type(i);

From 1e03401f67744d4d7a1bb3c41ad37297b7a38acf Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Fri, 13 Sep 2024 12:56:07 -0400
Subject: [PATCH 7/7] omp simd causes segfaults

---
 src/spreadinterp.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 5aa68bb11..0afeab562 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1847,13 +1847,13 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
 }
 
 template<typename simd_type, std::size_t... Is>
-simd_type make_incremented_vectors(std::index_sequence<Is...>) {
+static FINUFFT_ALWAYS_INLINE simd_type make_incremented_vectors(
+    std::index_sequence<Is...>) {
   return simd_type{Is...}; // Creates a SIMD vector with the index sequence
 }
 
 template<std::size_t N, typename simd_type>
-static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates_impl(
-    const simd_type &vec) noexcept {
+static FINUFFT_ALWAYS_INLINE bool has_duplicates_impl(const simd_type &vec) noexcept {
 
   if constexpr (N == simd_type::size) {
     return false;
@@ -1868,8 +1868,7 @@ static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates_impl(
 }
 
 template<typename simd_type>
-static FINUFFT_ALWAYS_INLINE constexpr bool has_duplicates(
-    const simd_type &vec) noexcept {
+static FINUFFT_ALWAYS_INLINE bool has_duplicates(const simd_type &vec) noexcept {
   return has_duplicates_impl<1, simd_type>(vec);
 }
 
@@ -2064,7 +2063,6 @@ void bin_sort_singlethread(
   // count how many pts in each bin
   std::vector<BIGINT> counts(nbins, 0);
 
-#pragma omp simd
   for (auto i = 0; i < M; i++) {
     // find the bin index in however many dims are needed
     const auto i1  = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
@@ -2082,7 +2080,6 @@ void bin_sort_singlethread(
     current_offset += tmp;
   } // (counts now contains the index offsets for each bin)
 
-#pragma omp simd
   for (auto i = 0; i < M; i++) {
     // find the bin index (again! but better than using RAM)
     const auto i1    = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);