Skip to content

Commit

Permalink
Changed foldrescale to a faster version that has no boundary limitations
Browse files Browse the repository at this point in the history
  • Loading branch information
Marco Barbone committed May 10, 2024
1 parent cc8629f commit 634f163
Show file tree
Hide file tree
Showing 34 changed files with 510 additions and 186 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
* MAX_NF increased from 1e11 to 1e12, since machines grow.
* improved GPU python docs: migration guide; usage from cupy, numba, torch,
pycuda. PyPI pkg still at 2.2.0beta.
* Used new foldrescale and removed tests for the range

V 2.2.0 (12/12/23)

Expand Down
10 changes: 8 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disable
option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF)
option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON)
option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON)
option(FINUFFT_BUILD_DEVEL "Whether to build developement executables" OFF)
# sphinx tag (don't remove): @cmake_opts_end

if(FINUFFT_USE_CPU)
Expand All @@ -45,10 +46,11 @@ if(FINUFFT_USE_CPU)
endif()

set(CPM_DOWNLOAD_VERSION 0.38.0)
include(cmake/setupCPM.cmake)

set(FFTW_VERSION 3.3.10)

include(cmake/setupCPM.cmake)
include(cmake/setupFFTW.cmake)

endif()

if (FINUFFT_BUILD_MATLAB)
Expand Down Expand Up @@ -246,6 +248,10 @@ if (FINUFFT_BUILD_MATLAB)
add_subdirectory(matlab)
endif ()

if (FINUFFT_BUILD_DEVEL)
add_subdirectory(devel)
endif ()

include(GNUInstallDirs)
install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER)
install(FILES ${PROJECT_SOURCE_DIR}/LICENSE
Expand Down
19 changes: 17 additions & 2 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,22 @@
"generator": "Ninja Multi-Config",
"cacheVariables": {
"FINUFFT_BUILD_TESTS": "ON",
"FINUFFT_BUILD_EXAMPLES": "ON"
"FINUFFT_BUILD_EXAMPLES": "ON",
"FINUFFT_BUILD_DEVEL": "ON"
}
},
{
"name": "benchmark",
"binaryDir": "build/benchmark",
"displayName": "Benchmark",
"description": "Benchmark release configuration (ninja)",
"generator": "Ninja",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
"FINUFFT_BUILD_TESTS": "ON",
"FINUFFT_BUILD_EXAMPLES": "ON",
"FINUFFT_FFTW_SUFFIX": "",
"FINUFFT_USE_OPENMP": "OFF"
}
},
{
Expand Down Expand Up @@ -104,7 +119,7 @@
{
"name": "dev",
"configurePreset": "dev",
"configuration": "Debug"
"configuration": "RelWithDebInfo"
},
{
"name": "ninja-multi",
Expand Down
21 changes: 21 additions & 0 deletions devel/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
project(finufft_devel)
# Set the minimum required version of CMake
cmake_minimum_required(VERSION 3.5)


# include cpm cmake, downloading it
CPMAddPackage(
NAME benchmark
GITHUB_REPOSITORY google/benchmark
VERSION 1.8.3
OPTIONS "BENCHMARK_ENABLE_TESTING OFF"

)

if (benchmark_ADDED)
# patch benchmark target
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
endif()

add_executable(foldrescale foldrescale.cpp)
target_link_libraries(foldrescale finufft benchmark)
294 changes: 294 additions & 0 deletions devel/foldrescale.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
#include "finufft/defs.h"
#include <benchmark/benchmark.h>
#include <iostream>
#include <random>
#include <cmath>
// no vectorize
#pragma GCC optimize("no-tree-vectorize")
/* local NU coord fold+rescale macro: does the following affine transform to x:
when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N)
otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N)
Thus, only one period either side of the principal domain is folded.
(It is *so* much faster than slow std::fmod that we stick to it.)
This explains FINUFFT's allowed input domain of [-3pi,3pi).
Speed comparisons of this macro vs a function are in devel/foldrescale*.
The macro wins hands-down on i7, even for modern GCC9.
This should be done in C++ not as a macro, someday.
*/
#define FOLDRESCALE(x, N, p) (p ? \
(x + (x>=-PI ? (x<PI ? PI : -PI) : 3*PI)) * ((FLT)M_1_2PI*N) : \
(x>=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N))


#define FOLDRESCALE04(x, N, p) (p ? \
((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) : \
((x/FLT(N))-floor(x/FLT(N)))*FLT(N))

#define FOLDRESCALE05(x, N, p) FLT(N) * (p ? \
((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) : \
((x/FLT(N))-floor(x/FLT(N))))

inline __attribute__((always_inline))
FLT foldRescale00(FLT x, BIGINT N, bool p) {
FLT result;
FLT fN = FLT(N);
if (p) {
static constexpr FLT x2pi = FLT(M_1_2PI);
result = x * x2pi + FLT(0.5);
result -= floor(result);
} else {
const FLT invN = FLT(1.0) / fN;
result = x * invN;
result -= floor(result);
}
return result * fN;
}

inline __attribute__((always_inline))
FLT foldRescale01(FLT x, BIGINT N, bool p) {
return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N) :
(x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N);
}

template<bool p>
inline __attribute__((always_inline))
FLT foldRescale02(FLT x, BIGINT N) {
if constexpr (p) {
return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N);
} else {
return (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N);
}
}

template<bool p>
inline __attribute__((always_inline))
FLT foldRescale03(FLT x, BIGINT N) {
FLT result;
FLT fN = FLT(N);
if constexpr (p) {
static constexpr FLT x2pi = FLT(M_1_2PI);
result = x * x2pi + FLT(0.5);
result -= floor(result);
} else {
const FLT invN = FLT(1.0) / fN;
result = x * invN;
result -= floor(result);
}
return result * fN;
}

static std::mt19937_64 gen;
static std::uniform_real_distribution<> dis(-3 * M_PI, 3 * M_PI);
static const auto N = std::uniform_int_distribution<>{0, 1000}(gen);
static std::uniform_real_distribution<> disN(-N, 2*N);
static volatile auto pirange = true;
static volatile auto notPirange = !pirange;

static void BM_BASELINE(benchmark::State &state) {
for (auto _: state) {
benchmark::DoNotOptimize(dis(gen));
}
}

static void BM_FoldRescaleMacro(benchmark::State &state) {
for (auto _: state) {
FLT x = dis(gen);
benchmark::DoNotOptimize(FOLDRESCALE(x, N, pirange));
}
}

static void BM_FoldRescaleMacroN(benchmark::State &state) {
for (auto _: state) {
FLT x = disN(gen);
benchmark::DoNotOptimize(FOLDRESCALE(x, N, notPirange));
}
}

static void BM_FoldRescale00(benchmark::State &state) {
for (auto _: state) {
FLT x = dis(gen);
benchmark::DoNotOptimize(foldRescale00(x, N, pirange));
}
}


static void BM_FoldRescale00N(benchmark::State &state) {
for (auto _: state) {
FLT x = disN(gen);
benchmark::DoNotOptimize(foldRescale00(x, N, notPirange));
}
}


static void BM_FoldRescale01(benchmark::State &state) {
for (auto _: state) {
FLT x = dis(gen);
benchmark::DoNotOptimize(foldRescale01(x, N, pirange));
}
}


static void BM_FoldRescale01N(benchmark::State &state) {
for (auto _: state) {
FLT x = disN(gen);
benchmark::DoNotOptimize(foldRescale01(x, N, notPirange));
}
}

static void BM_FoldRescale02(benchmark::State &state) {
for (auto _: state) {
FLT x = dis(gen);
benchmark::DoNotOptimize(foldRescale02<true>(x, N));
}
}


static void BM_FoldRescale02N(benchmark::State &state) {
for (auto _: state) {
FLT x = disN(gen);
benchmark::DoNotOptimize(foldRescale02<false>(x, N));
}
}


static void BM_FoldRescale03(benchmark::State &state) {
for (auto _: state) {
FLT x = dis(gen);
benchmark::DoNotOptimize(foldRescale03<true>(x, N));
}
}

static void BM_FoldRescale03N(benchmark::State &state) {
for (auto _: state) {
FLT x = disN(gen);
benchmark::DoNotOptimize(foldRescale03<false>(x, N));
}
}

static void BM_FoldRescale04(benchmark::State &state) {
for (auto _: state) {
FLT x = dis(gen);
benchmark::DoNotOptimize(FOLDRESCALE04(x, N, pirange));
}
}

static void BM_FoldRescale04N(benchmark::State &state) {
for (auto _: state) {
FLT x = disN(gen);
benchmark::DoNotOptimize(FOLDRESCALE04(x, N, notPirange));
}
}

static void BM_FoldRescale05(benchmark::State &state) {
for (auto _: state) {
FLT x = dis(gen);
benchmark::DoNotOptimize(FOLDRESCALE05(x, N, pirange));
}
}

static void BM_FoldRescale05N(benchmark::State &state) {
for (auto _: state) {
FLT x = disN(gen);
benchmark::DoNotOptimize(FOLDRESCALE05(x, N, notPirange));
}
}


BENCHMARK(BM_BASELINE);

BENCHMARK(BM_FoldRescaleMacro);
BENCHMARK(BM_FoldRescale00);
BENCHMARK(BM_FoldRescale01);
BENCHMARK(BM_FoldRescale02);
BENCHMARK(BM_FoldRescale03);
BENCHMARK(BM_FoldRescale04);
BENCHMARK(BM_FoldRescale05);

BENCHMARK(BM_FoldRescaleMacroN);
BENCHMARK(BM_FoldRescale00N);
BENCHMARK(BM_FoldRescale01N);
BENCHMARK(BM_FoldRescale02N);
BENCHMARK(BM_FoldRescale03N);
BENCHMARK(BM_FoldRescale04N);
BENCHMARK(BM_FoldRescale05N);

void testFoldRescaleFunctions() {
for (bool p: {true, false}) {
for (int i = 0; i < 1024; ++i) { // Run the test 1000 times
FLT x = dis(gen);
FLT resultMacro = FOLDRESCALE(x, N, p);
FLT result00 = foldRescale00(x, N, p);
FLT result01 = foldRescale01(x, N, p);
FLT result02 = p ? foldRescale02<true>(x, N) : foldRescale02<false>(x, N);
FLT result03 = p ? foldRescale03<true>(x, N) : foldRescale03<false>(x, N);
FLT result04 = FOLDRESCALE04(x, N, p);
FLT result05 = FOLDRESCALE05(x, N, p);
// function that compares two floating point number with a tolerance, using relative error
auto compare = [](FLT a, FLT b) {
return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13;
};

if (compare(resultMacro, result00)) {
std::cout << "resultMacro: " << resultMacro << " result00: " << result00 << std::endl;
throw std::runtime_error("function00 is wrong");
}
if (compare(resultMacro, result01)) {
std::cout << "resultMacro: " << resultMacro << " result01: " << result01 << std::endl;
throw std::runtime_error("function01 is wrong");
}
if (compare(resultMacro, result02)) {
std::cout << "resultMacro: " << resultMacro << " result02: " << result02 << std::endl;
throw std::runtime_error("function02 is wrong");
}
if (compare(resultMacro, result03)) {
std::cout << "resultMacro: " << resultMacro << " result03: " << result03 << std::endl;
throw std::runtime_error("function03 is wrong");
}
if (compare(resultMacro, result04)) {
std::cout << "resultMacro: " << resultMacro << " result04: " << result04 << std::endl;
throw std::runtime_error("function04 is wrong");
}
if (compare(resultMacro, result05)) {
std::cout << "resultMacro: " << resultMacro << " result05: " << result05 << std::endl;
throw std::runtime_error("function05 is wrong");
}
}
}
}

class BaselineSubtractingReporter : public benchmark::ConsoleReporter {
public:
bool ReportContext(const Context &context) override {
return benchmark::ConsoleReporter::ReportContext(context);
}

void ReportRuns(const std::vector<Run> &reports) override {
for (const auto &run: reports) {
if (run.benchmark_name() == "BM_BASELINE") {
baseline_time = run.cpu_accumulated_time;
} else {
Run modified_run = run;
modified_run.cpu_accumulated_time -= baseline_time;
benchmark::ConsoleReporter::ReportRuns({modified_run});
}
}
}

private:
double baseline_time = 0.0;
};

int main(int argc, char **argv) {
pirange = argc & 1;
notPirange = !pirange;
static std::random_device rd;
const auto seed = rd();
std::cout << "Seed: " << seed << "\n";
// gen.seed(226307105);
gen.seed(seed);
testFoldRescaleFunctions();
::benchmark::Initialize(&argc, argv);
BaselineSubtractingReporter reporter;
::benchmark::RunSpecifiedBenchmarks(&reporter);
return 0;
}
Loading

0 comments on commit 634f163

Please sign in to comment.