Changed foldrescale to a faster version that has no boundary limitations

flatironinstitute · May 10, 2024 · 634f163 · 634f163
1 parent cc8629f
commit 634f163
Show file tree

Hide file tree

Showing 34 changed files with 510 additions and 186 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -9,6 +9,7 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 * MAX_NF increased from 1e11 to 1e12, since machines grow.
 * improved GPU python docs: migration guide; usage from cupy, numba, torch,
   pycuda. PyPI pkg still at 2.2.0beta.
+* Used new foldrescale and removed tests for the range
 
 V 2.2.0 (12/12/23)
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -31,6 +31,7 @@ option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disable
 option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF)
 option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON)
 option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON)
+option(FINUFFT_BUILD_DEVEL "Whether to build developement executables" OFF)
 # sphinx tag (don't remove): @cmake_opts_end
 
 if(FINUFFT_USE_CPU)
@@ -45,10 +46,11 @@ if(FINUFFT_USE_CPU)
     endif()
 
     set(CPM_DOWNLOAD_VERSION 0.38.0)
-    include(cmake/setupCPM.cmake)
-
     set(FFTW_VERSION 3.3.10)
+
+    include(cmake/setupCPM.cmake)
     include(cmake/setupFFTW.cmake)
+
 endif()
 
 if (FINUFFT_BUILD_MATLAB)
@@ -246,6 +248,10 @@ if (FINUFFT_BUILD_MATLAB)
     add_subdirectory(matlab)
 endif ()
 
+if (FINUFFT_BUILD_DEVEL)
+    add_subdirectory(devel)
+endif ()
+
 include(GNUInstallDirs)
 install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER)
 install(FILES ${PROJECT_SOURCE_DIR}/LICENSE

diff --git a/CMakePresets.json b/CMakePresets.json
@@ -31,7 +31,22 @@
             "generator": "Ninja Multi-Config",
             "cacheVariables": {
                 "FINUFFT_BUILD_TESTS": "ON",
-                "FINUFFT_BUILD_EXAMPLES": "ON"
+                "FINUFFT_BUILD_EXAMPLES": "ON",
+                "FINUFFT_BUILD_DEVEL": "ON"
+            }
+        },
+        {
+            "name": "benchmark",
+            "binaryDir": "build/benchmark",
+            "displayName": "Benchmark",
+            "description": "Benchmark release configuration (ninja)",
+            "generator": "Ninja",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "RelWithDebInfo",
+                "FINUFFT_BUILD_TESTS": "ON",
+                "FINUFFT_BUILD_EXAMPLES": "ON",
+                "FINUFFT_FFTW_SUFFIX": "",
+                "FINUFFT_USE_OPENMP": "OFF"
             }
         },
         {
@@ -104,7 +119,7 @@
         {
             "name": "dev",
             "configurePreset": "dev",
-            "configuration": "Debug"
+            "configuration": "RelWithDebInfo"
         },
         {
             "name": "ninja-multi",

diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt
@@ -0,0 +1,21 @@
+project(finufft_devel)
+# Set the minimum required version of CMake
+cmake_minimum_required(VERSION 3.5)
+
+
+# include cpm cmake, downloading it
+CPMAddPackage(
+    NAME benchmark
+    GITHUB_REPOSITORY google/benchmark
+    VERSION 1.8.3
+    OPTIONS "BENCHMARK_ENABLE_TESTING OFF"
+
+)
+
+if (benchmark_ADDED)
+    # patch benchmark target
+    set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
+endif()
+
+add_executable(foldrescale foldrescale.cpp)
+target_link_libraries(foldrescale finufft benchmark)
diff --git a/devel/foldrescale.cpp b/devel/foldrescale.cpp
@@ -0,0 +1,294 @@
+#include "finufft/defs.h"
+#include <benchmark/benchmark.h>
+#include <iostream>
+#include <random>
+#include <cmath>
+// no vectorize
+#pragma GCC optimize("no-tree-vectorize")
+/* local NU coord fold+rescale macro: does the following affine transform to x:
+     when p=true:   map [-3pi,-pi) and [-pi,pi) and [pi,3pi)    each to [0,N)
+     otherwise,     map [-N,0) and [0,N) and [N,2N)             each to [0,N)
+   Thus, only one period either side of the principal domain is folded.
+   (It is *so* much faster than slow std::fmod that we stick to it.)
+   This explains FINUFFT's allowed input domain of [-3pi,3pi).
+   Speed comparisons of this macro vs a function are in devel/foldrescale*.
+   The macro wins hands-down on i7, even for modern GCC9.
+   This should be done in C++ not as a macro, someday.
+*/
+#define FOLDRESCALE(x, N, p) (p ?                                         \
+         (x + (x>=-PI ? (x<PI ? PI : -PI) : 3*PI)) * ((FLT)M_1_2PI*N) : \
+                        (x>=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N))
+
+
+#define FOLDRESCALE04(x, N, p) (p ? \
+   ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5)))  * FLT(N) : \
+    ((x/FLT(N))-floor(x/FLT(N)))*FLT(N))
+
+#define FOLDRESCALE05(x, N, p) FLT(N) * (p ? \
+   ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) : \
+    ((x/FLT(N))-floor(x/FLT(N))))
+
+inline __attribute__((always_inline))
+FLT foldRescale00(FLT x, BIGINT N, bool p) {
+  FLT result;
+  FLT fN = FLT(N);
+  if (p) {
+    static constexpr FLT x2pi = FLT(M_1_2PI);
+    result = x * x2pi + FLT(0.5);
+    result -= floor(result);
+  } else {
+    const FLT invN = FLT(1.0) / fN;
+    result = x * invN;
+    result -= floor(result);
+  }
+  return result * fN;
+}
+
+inline __attribute__((always_inline))
+FLT foldRescale01(FLT x, BIGINT N, bool p) {
+  return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N) :
+         (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N);
+}
+
+template<bool p>
+inline __attribute__((always_inline))
+FLT foldRescale02(FLT x, BIGINT N) {
+  if constexpr (p) {
+    return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N);
+  } else {
+    return (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N);
+  }
+}
+
+template<bool p>
+inline __attribute__((always_inline))
+FLT foldRescale03(FLT x, BIGINT N) {
+  FLT result;
+  FLT fN = FLT(N);
+  if constexpr (p) {
+    static constexpr FLT x2pi = FLT(M_1_2PI);
+    result = x * x2pi + FLT(0.5);
+    result -= floor(result);
+  } else {
+    const FLT invN = FLT(1.0) / fN;
+    result = x * invN;
+    result -= floor(result);
+  }
+  return result * fN;
+}
+
+static std::mt19937_64 gen;
+static std::uniform_real_distribution<> dis(-3 * M_PI, 3 * M_PI);
+static const auto N = std::uniform_int_distribution<>{0, 1000}(gen);
+static std::uniform_real_distribution<> disN(-N, 2*N);
+static volatile auto pirange = true;
+static volatile auto notPirange = !pirange;
+
+static void BM_BASELINE(benchmark::State &state) {
+  for (auto _: state) {
+    benchmark::DoNotOptimize(dis(gen));
+  }
+}
+
+static void BM_FoldRescaleMacro(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = dis(gen);
+    benchmark::DoNotOptimize(FOLDRESCALE(x, N, pirange));
+  }
+}
+
+static void BM_FoldRescaleMacroN(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = disN(gen);
+    benchmark::DoNotOptimize(FOLDRESCALE(x, N, notPirange));
+  }
+}
+
+static void BM_FoldRescale00(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = dis(gen);
+    benchmark::DoNotOptimize(foldRescale00(x, N, pirange));
+  }
+}
+
+
+static void BM_FoldRescale00N(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = disN(gen);
+    benchmark::DoNotOptimize(foldRescale00(x, N, notPirange));
+  }
+}
+
+
+static void BM_FoldRescale01(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = dis(gen);
+    benchmark::DoNotOptimize(foldRescale01(x, N, pirange));
+  }
+}
+
+
+static void BM_FoldRescale01N(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = disN(gen);
+    benchmark::DoNotOptimize(foldRescale01(x, N, notPirange));
+  }
+}
+
+static void BM_FoldRescale02(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = dis(gen);
+    benchmark::DoNotOptimize(foldRescale02<true>(x, N));
+  }
+}
+
+
+static void BM_FoldRescale02N(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = disN(gen);
+    benchmark::DoNotOptimize(foldRescale02<false>(x, N));
+  }
+}
+
+
+static void BM_FoldRescale03(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = dis(gen);
+    benchmark::DoNotOptimize(foldRescale03<true>(x, N));
+  }
+}
+
+static void BM_FoldRescale03N(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = disN(gen);
+    benchmark::DoNotOptimize(foldRescale03<false>(x, N));
+  }
+}
+
+static void BM_FoldRescale04(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = dis(gen);
+    benchmark::DoNotOptimize(FOLDRESCALE04(x, N, pirange));
+  }
+}
+
+static void BM_FoldRescale04N(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = disN(gen);
+    benchmark::DoNotOptimize(FOLDRESCALE04(x, N, notPirange));
+  }
+}
+
+static void BM_FoldRescale05(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = dis(gen);
+    benchmark::DoNotOptimize(FOLDRESCALE05(x, N, pirange));
+  }
+}
+
+static void BM_FoldRescale05N(benchmark::State &state) {
+  for (auto _: state) {
+    FLT x = disN(gen);
+    benchmark::DoNotOptimize(FOLDRESCALE05(x, N, notPirange));
+  }
+}
+
+
+BENCHMARK(BM_BASELINE);
+
+BENCHMARK(BM_FoldRescaleMacro);
+BENCHMARK(BM_FoldRescale00);
+BENCHMARK(BM_FoldRescale01);
+BENCHMARK(BM_FoldRescale02);
+BENCHMARK(BM_FoldRescale03);
+BENCHMARK(BM_FoldRescale04);
+BENCHMARK(BM_FoldRescale05);
+
+BENCHMARK(BM_FoldRescaleMacroN);
+BENCHMARK(BM_FoldRescale00N);
+BENCHMARK(BM_FoldRescale01N);
+BENCHMARK(BM_FoldRescale02N);
+BENCHMARK(BM_FoldRescale03N);
+BENCHMARK(BM_FoldRescale04N);
+BENCHMARK(BM_FoldRescale05N);
+
+void testFoldRescaleFunctions() {
+  for (bool p: {true, false}) {
+    for (int i = 0; i < 1024; ++i) {  // Run the test 1000 times
+      FLT x = dis(gen);
+      FLT resultMacro = FOLDRESCALE(x, N, p);
+      FLT result00 = foldRescale00(x, N, p);
+      FLT result01 = foldRescale01(x, N, p);
+      FLT result02 = p ? foldRescale02<true>(x, N) : foldRescale02<false>(x, N);
+      FLT result03 = p ? foldRescale03<true>(x, N) : foldRescale03<false>(x, N);
+      FLT result04 = FOLDRESCALE04(x, N, p);
+      FLT result05 = FOLDRESCALE05(x, N, p);
+      // function that compares two floating point number with a tolerance, using relative error
+      auto compare = [](FLT a, FLT b) {
+        return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13;
+      };
+
+      if (compare(resultMacro, result00)) {
+        std::cout << "resultMacro: " << resultMacro << " result00: " << result00 << std::endl;
+        throw std::runtime_error("function00 is wrong");
+      }
+      if (compare(resultMacro, result01)) {
+        std::cout << "resultMacro: " << resultMacro << " result01: " << result01 << std::endl;
+        throw std::runtime_error("function01 is wrong");
+      }
+      if (compare(resultMacro, result02)) {
+        std::cout << "resultMacro: " << resultMacro << " result02: " << result02 << std::endl;
+        throw std::runtime_error("function02 is wrong");
+      }
+      if (compare(resultMacro, result03)) {
+        std::cout << "resultMacro: " << resultMacro << " result03: " << result03 << std::endl;
+        throw std::runtime_error("function03 is wrong");
+      }
+      if (compare(resultMacro, result04)) {
+        std::cout << "resultMacro: " << resultMacro << " result04: " << result04 << std::endl;
+        throw std::runtime_error("function04 is wrong");
+      }
+      if (compare(resultMacro, result05)) {
+        std::cout << "resultMacro: " << resultMacro << " result05: " << result05 << std::endl;
+        throw std::runtime_error("function05 is wrong");
+      }
+    }
+  }
+}
+
+class BaselineSubtractingReporter : public benchmark::ConsoleReporter {
+public:
+  bool ReportContext(const Context &context) override {
+    return benchmark::ConsoleReporter::ReportContext(context);
+  }
+
+  void ReportRuns(const std::vector<Run> &reports) override {
+    for (const auto &run: reports) {
+      if (run.benchmark_name() == "BM_BASELINE") {
+        baseline_time = run.cpu_accumulated_time;
+      } else {
+        Run modified_run = run;
+        modified_run.cpu_accumulated_time -= baseline_time;
+        benchmark::ConsoleReporter::ReportRuns({modified_run});
+      }
+    }
+  }
+
+private:
+  double baseline_time = 0.0;
+};
+
+int main(int argc, char **argv) {
+  pirange = argc & 1;
+  notPirange = !pirange;
+  static std::random_device rd;
+  const auto seed = rd();
+  std::cout << "Seed: " << seed << "\n";
+//  gen.seed(226307105);
+  gen.seed(seed);
+  testFoldRescaleFunctions();
+  ::benchmark::Initialize(&argc, argv);
+  BaselineSubtractingReporter reporter;
+  ::benchmark::RunSpecifiedBenchmarks(&reporter);
+  return 0;
+}