Skip to content

Commit

Permalink
added vectorized foldrescale to devel
Browse files Browse the repository at this point in the history
  • Loading branch information
Marco Barbone committed May 14, 2024
1 parent bf3994a commit 4af801e
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 22 deletions.
1 change: 1 addition & 0 deletions devel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ endif()

add_executable(foldrescale foldrescale.cpp)
target_link_libraries(foldrescale finufft benchmark)
target_compile_options(foldrescale PRIVATE -mavx2)
107 changes: 85 additions & 22 deletions devel/foldrescale.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
#include <iostream>
#include <random>
#include <cmath>
#include <immintrin.h>
// no vectorize
#pragma GCC optimize("no-tree-vectorize")
//#pragma GCC optimize("no-tree-vectorize")
/* local NU coord fold+rescale macro: does the following affine transform to x:
when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N)
otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N)
Expand Down Expand Up @@ -67,7 +68,7 @@ FLT foldRescale03(FLT x, BIGINT N) {
FLT fN = FLT(N);
if constexpr (p) {
static constexpr FLT x2pi = FLT(M_1_2PI);
result = x * x2pi + FLT(0.5);
result = std::fma(x, x2pi, FLT(0.5));
result -= floor(result);
} else {
const FLT invN = FLT(1.0) / fN;
Expand All @@ -77,8 +78,22 @@ FLT foldRescale03(FLT x, BIGINT N) {
return result * fN;
}

#ifdef __AVX2__

inline __attribute__((always_inline))
__m256d foldRescaleVec(__m256d x, BIGINT N) {
__m256d result;
__m256d fN = _mm256_set1_pd(FLT(N));
static const __m256d x2pi = _mm256_set1_pd(FLT(M_1_2PI));
static const __m256d half = _mm256_set1_pd(FLT(0.5));
result = _mm256_fmadd_pd(x, x2pi, half);
result = _mm256_sub_pd(result, _mm256_floor_pd(result));
return _mm256_mul_pd(result, fN);
}
#endif

static std::mt19937_64 gen;
static std::uniform_real_distribution<> dis(-3 * M_PI, 3 * M_PI);
static std::uniform_real_distribution<> dis(-10, 10);
static const auto N = std::uniform_int_distribution<>{0, 1000}(gen);
static std::uniform_real_distribution<> disN(-N, 2*N);
static volatile auto pirange = true;
Expand Down Expand Up @@ -194,26 +209,71 @@ static void BM_FoldRescale05N(benchmark::State &state) {
}


BENCHMARK(BM_BASELINE);

BENCHMARK(BM_FoldRescaleMacro);
BENCHMARK(BM_FoldRescale00);
BENCHMARK(BM_FoldRescale01);
BENCHMARK(BM_FoldRescale02);
BENCHMARK(BM_FoldRescale03);
BENCHMARK(BM_FoldRescale04);
BENCHMARK(BM_FoldRescale05);

BENCHMARK(BM_FoldRescaleMacroN);
BENCHMARK(BM_FoldRescale00N);
BENCHMARK(BM_FoldRescale01N);
BENCHMARK(BM_FoldRescale02N);
BENCHMARK(BM_FoldRescale03N);
BENCHMARK(BM_FoldRescale04N);
BENCHMARK(BM_FoldRescale05N);
#ifdef __AVX2__
static void BM_FoldRescaleVec(benchmark::State &state) {
for (auto _: state) {
// Generate 4 floating point numbers
double x1 = dis(gen);
double x2 = dis(gen);
double x3 = dis(gen);
double x4 = dis(gen);
// Pack them into an AVX vector
__m256d x = _mm256_set_pd(x1, x2, x3, x4);
// Call the foldRescaleVec function
benchmark::DoNotOptimize(foldRescaleVec(x, N));
}
}
#endif


BENCHMARK(BM_BASELINE)->Iterations(10000000);
BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000);
BENCHMARK(BM_FoldRescale00)->Iterations(1000000);
BENCHMARK(BM_FoldRescale01)->Iterations(1000000);
BENCHMARK(BM_FoldRescale02)->Iterations(1000000);
BENCHMARK(BM_FoldRescale03)->Iterations(10000000);
BENCHMARK(BM_FoldRescale04)->Iterations(1000000);
BENCHMARK(BM_FoldRescale05)->Iterations(1000000);
#ifdef __AVX2__
BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000/4);
#endif
BENCHMARK(BM_FoldRescaleMacroN)->Iterations(1000000);
BENCHMARK(BM_FoldRescale00N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale01N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale02N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale03N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale04N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale05N)->Iterations(1000000);


#ifdef __AVX2__
void testFoldRescaleVec_avx256_vs_foldRescale00() {
// Generate 4 floating point numbers
double x1 = dis(gen);
double x2 = dis(gen);
double x3 = dis(gen);
double x4 = dis(gen);

// Pack them into an AVX vector
__m256d xVec = _mm256_set_pd(x1, x2, x3, x4);

// Call the foldRescaleVec function
__m256d resultVec = foldRescaleVec(xVec, N);

// Extract the results from the AVX vector

for (int i = 0; i < 4; ++i) {
double result00 = foldRescale03<true>(xVec[i], N);
if (std::abs(1 - result00 / resultVec[i]) > 1e-14) {
std::cout << "input: " << xVec[i] << " result00: " << result00 << " result256: " << resultVec[i] << std::endl;
throw std::runtime_error("foldRescaleVec is not equivalent to foldRescale00");
}
}
}
#endif

void testFoldRescaleFunctions() {
for (bool p: {true, false}) {
for (bool p: {true}) {
for (int i = 0; i < 1024; ++i) { // Run the test 1000 times
FLT x = dis(gen);
FLT resultMacro = FOLDRESCALE(x, N, p);
Expand All @@ -223,6 +283,7 @@ void testFoldRescaleFunctions() {
FLT result03 = p ? foldRescale03<true>(x, N) : foldRescale03<false>(x, N);
FLT result04 = FOLDRESCALE04(x, N, p);
FLT result05 = FOLDRESCALE05(x, N, p);

// function that compares two floating point number with a tolerance, using relative error
auto compare = [](FLT a, FLT b) {
return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13;
Expand Down Expand Up @@ -284,9 +345,11 @@ int main(int argc, char **argv) {
static std::random_device rd;
const auto seed = rd();
std::cout << "Seed: " << seed << "\n";
// gen.seed(226307105);
gen.seed(seed);
testFoldRescaleFunctions();
#ifdef __AVX2__
testFoldRescaleVec_avx256_vs_foldRescale00();
#endif
::benchmark::Initialize(&argc, argv);
BaselineSubtractingReporter reporter;
::benchmark::RunSpecifiedBenchmarks(&reporter);
Expand Down

0 comments on commit 4af801e

Please sign in to comment.