From 55c1dca413041ecd788ece5659ed809ad75d6a58 Mon Sep 17 00:00:00 2001 From: Yohan Chatelain Date: Tue, 6 Aug 2024 22:04:33 -0400 Subject: [PATCH] WIP --- src/libvfcinstrumentonline/rand/app/main.cpp | 130 ++++++ src/libvfcinstrumentonline/rand/src/eft.hpp | 22 +- src/libvfcinstrumentonline/rand/src/main.hpp | 3 + src/libvfcinstrumentonline/rand/src/rand.hpp | 16 + src/libvfcinstrumentonline/rand/src/sr.hpp | 101 ++--- .../rand/src/sr_avx.hpp | 292 +++++++++++++ .../rand/src/sr_avx2.hpp | 371 ++++++++++++++++ .../rand/src/sr_sse4-2.hpp | 186 ++++++++ src/libvfcinstrumentonline/rand/src/utils.hpp | 3 +- .../rand/src/vector_types.hpp | 21 +- .../rand/src/xoroshiro256+.hpp | 406 ++++++++++++++++++ src/libvfcinstrumentonline/rand/test.cpp | 2 +- src/libvfcinstrumentonline/rand/test.sh | 4 +- tests/test_online_instrumentation_sr/test.c | 144 ------- tests/test_online_instrumentation_sr/test.sh | 22 - 15 files changed, 1486 insertions(+), 237 deletions(-) create mode 100644 src/libvfcinstrumentonline/rand/app/main.cpp create mode 100644 src/libvfcinstrumentonline/rand/src/sr_avx.hpp create mode 100644 src/libvfcinstrumentonline/rand/src/sr_avx2.hpp create mode 100644 src/libvfcinstrumentonline/rand/src/sr_sse4-2.hpp create mode 100644 src/libvfcinstrumentonline/rand/src/xoroshiro256+.hpp delete mode 100644 tests/test_online_instrumentation_sr/test.c delete mode 100755 tests/test_online_instrumentation_sr/test.sh diff --git a/src/libvfcinstrumentonline/rand/app/main.cpp b/src/libvfcinstrumentonline/rand/app/main.cpp new file mode 100644 index 00000000..17df1df3 --- /dev/null +++ b/src/libvfcinstrumentonline/rand/app/main.cpp @@ -0,0 +1,130 @@ +#include + +#include + +#include "src/debug.hpp" +#include "src/main.hpp" +#include "src/rand.hpp" +#include "src/sr.hpp" +#include "src/sr_avx2.hpp" + +#ifndef UNROLL +#define UNROLL 1 +#endif + +void run_avx2_add_d(__m256d a, __m256d b, __m256d c[UNROLL]) { + for (int i = 0; i < UNROLL; i++) { + c[i] = sr_add_avx2(a, b); + } +} + +void run_avx2_mul_d(__m256d a, __m256d b, __m256d c[UNROLL]) { + for (int i = 0; i < UNROLL; i++) { + c[i] = sr_mul_avx2(a, b); + } +} + +void run_avx2_div_d(__m256d a, __m256d b, __m256d c[UNROLL]) { + for (int i = 0; i < UNROLL; i++) { + c[i] = sr_div_avx2(a, b); + } +} + +void run_add_d(double a, double b, double c[UNROLL]) { + for (int i = 0; i < UNROLL; i++) { + c[i] = sr_add(a, b); + } +} + +void run_mul_d(double a, double b, double c[UNROLL]) { + for (int i = 0; i < UNROLL; i++) { + c[i] = sr_mul(a, b); + } +} + +void run_div_d(double a, double b, double c[UNROLL]) { + for (int i = 0; i < UNROLL; i++) { + c[i] = sr_div(a, b); + } +} + +void print_m256d(__m256d a) { + std::cout << std::hexfloat; + for (int i = 0; i < sizeof(a) / sizeof(double); i++) + std::cout << a[i] << " "; + std::cout << std::defaultfloat; +} + +void print_array_m256d(__m256d a[UNROLL]) { + for (int i = 0; i < UNROLL; i++) { + print_m256d(a[i]); + std::cout << std::endl; + } +} + +void print_array_d(double a[UNROLL]) { + std::cout << std::hexfloat; + for (int i = 0; i < UNROLL; i++) { + std::cout << a[i] << std::endl; + } + std::cout << std::defaultfloat; +} + +void test_scalar_d(double a, double b) { + std::cout << "sr_add" << std::endl; + double cs[UNROLL]; + run_add_d(a, b, cs); + print_array_d(cs); + std::cout << "sr_mul" << std::endl; + run_mul_d(a, b, cs); + print_array_d(cs); + std::cout << "sr_div" << std::endl; + run_div_d(a, b, cs); + print_array_d(cs); +} + +void test_avx2_d(__m256d a, __m256d b) { + std::cout << "sr_add_avx" << std::endl; + __m256d c[UNROLL]; + run_avx2_add_d(a, b, c); + + std::cout << std::hexfloat; + print_array_m256d(c); + + std::cout << "sr_mul_avx" << std::endl; + run_avx2_mul_d(a, b, c); + print_array_m256d(c); + + std::cout << "sr_div_avx" << std::endl; + run_avx2_div_d(a, b, c); + print_array_m256d(c); +} + +int main() { + + init(); + + std::vector aa, bb; + __m256d a, b; + std::cout << "sizeof a " << sizeof(a) << std::endl; + std::cout << "elements in a " << sizeof(a) / sizeof(double) << std::endl; + + for (int i = 0; i < 4; i++) { + double x; + std::cin >> x; + aa.push_back(x); + } + for (int i = 0; i < 4; i++) { + double x; + std::cin >> x; + bb.push_back(x); + } + + a = _mm256_loadu_pd(aa.data()); + b = _mm256_loadu_pd(bb.data()); + + test_avx2_d(a, b); + test_scalar_d(a[0], b[0]); + + return 0; +} \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/src/eft.hpp b/src/libvfcinstrumentonline/rand/src/eft.hpp index c26d9bb9..e9856ad1 100644 --- a/src/libvfcinstrumentonline/rand/src/eft.hpp +++ b/src/libvfcinstrumentonline/rand/src/eft.hpp @@ -1,6 +1,8 @@ #ifndef __VERIFICARLO_SRLIB_EFT_HPP__ #define __VERIFICARLO_SRLIB_EFT_HPP__ +#include + #include "debug.hpp" #include "vector_types.hpp" @@ -15,16 +17,16 @@ template void twosum(T a, T b, T &sigma, T &tau) { debug_print("twosum(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau); } -template void twosumx2(T a, T b, T &sigma, T &tau) { - if (std::abs(a[0]) < std::abs(b[0])) - std::swap(a[0], b[0]); - if (std::abs(a[1]) < std::abs(b[1])) - std::swap(a[1], b[1]); - sigma = a + b; - T z = sigma - a; - tau = (a - (sigma - z)) + (b - z); - debug_print("twosumx2(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau); -} +// template void twosumx2(T a, T b, T &sigma, T &tau) { +// if (std::abs(a[0]) < std::abs(b[0])) +// std::swap(a[0], b[0]); +// if (std::abs(a[1]) < std::abs(b[1])) +// std::swap(a[1], b[1]); +// sigma = a + b; +// T z = sigma - a; +// tau = (a - (sigma - z)) + (b - z); +// debug_print("twosumx2(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau); +// } template void twoprodfma(T a, T b, T &sigma, T &tau) { sigma = a * b; diff --git a/src/libvfcinstrumentonline/rand/src/main.hpp b/src/libvfcinstrumentonline/rand/src/main.hpp index c57ccab7..c7cc61db 100644 --- a/src/libvfcinstrumentonline/rand/src/main.hpp +++ b/src/libvfcinstrumentonline/rand/src/main.hpp @@ -32,6 +32,9 @@ void init() { auto seeds = get_seeds<1>(); xoroshiro256plus::scalar::init(rng_state, seeds); debug_print("initialized xoroshiro256+ with seed %lu\n", seeds[0]); + auto seeds_avx2 = get_seeds<4>(); + xoroshiro256plus::avx2::init(rng_state_x4, seeds_avx2); + debug_print("initialized xoroshiro256+ with seed %lu\n", seeds_avx2[0]); #elif defined(SHISHUA) auto seeds = get_seeds<4>(); uint64_t seed_state[4] = {seeds[0], seeds[1], seeds[2], seeds[3]}; diff --git a/src/libvfcinstrumentonline/rand/src/rand.hpp b/src/libvfcinstrumentonline/rand/src/rand.hpp index 295a8dea..0462f4d9 100644 --- a/src/libvfcinstrumentonline/rand/src/rand.hpp +++ b/src/libvfcinstrumentonline/rand/src/rand.hpp @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -37,12 +40,15 @@ using rng_t = xoroshiro256plus::scalar::rn_uint64; typedef xoroshiro256plus::scalar::state rng_state_t; static __thread rng_state_t rng_state; using rn_int64_x2 = sse4_2::int64x2_t; +using rn_int64_x4 = avx::int64x4_t; using rn_int32_x4 = sse4_2::int32x4_t; using float2 = scalar::floatx2_t; using float4 = sse4_2::floatx4_t; using double2 = sse4_2::doublex2_t; typedef xoroshiro256plus::sse4_2::state rng_state_x2_t; static __thread rng_state_x2_t rng_state_x2; +typedef xoroshiro256plus::avx2::state rng_state_x4_t; +static __thread rng_state_x4_t rng_state_x4; #elif defined(SHISHUA) #include "shishua.h" typedef prng_state rng_state_t; @@ -107,6 +113,16 @@ rn_int64_x2 get_rand_uint64_x2() { return rng; } +rn_int64_x4 get_rand_uint64_x4() { + rn_int64_x4 rng; +#ifdef XOROSHIRO + rng = xoroshiro256plus::avx2::next(rng_state_x4); +#else +#error "No PRNG defined" +#endif + return rng; +} + uint8_t get_rand_uint1() { return _get_rand_uint64() & 0x1; } #ifdef XOROSHIRO diff --git a/src/libvfcinstrumentonline/rand/src/sr.hpp b/src/libvfcinstrumentonline/rand/src/sr.hpp index 1e668852..0a1d4b09 100644 --- a/src/libvfcinstrumentonline/rand/src/sr.hpp +++ b/src/libvfcinstrumentonline/rand/src/sr.hpp @@ -3,6 +3,7 @@ #include "debug.hpp" #include "eft.hpp" +#include "rand.hpp" #include "utils.hpp" template bool isnumber(T a, T b) { @@ -110,28 +111,28 @@ template T sr_roundx2(const T sigma, const T tau, const T z) { return round; } -template <> -scalar::floatx2_t sr_roundx2(const scalar::floatx2_t &sigma, - const scalar::floatx2_t &tau, - const scalar::floatx2_t &z) { - using T = scalar::floatx2_t; - if (sse4_2::not0(isnumberx2(sigma, tau))) { - return {sr_round(sigma.f[0], tau.f[0], z.f[0]), - sr_round(sigma.f[1], tau.f[1], z.f[1])}; - } - constexpr int32_t mantissa = sr::utils::IEEE754::mantissa; - const auto sign_tau = sse4_2::cmplt(tau, sse4_2::setzero()); - const auto sign_sigma = sse4_2::cmplt(sigma, sse4_2::setzero()); - const scalar::int32x2_t eta; - const auto sign_cmp = scalar::bitwise_xor(sign_tau, sign_sigma); - scalar::int32x2_t eta; - eta.u32[0] = sse4_2::blendv( - sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[0])), - sse4_2::get_exponent(sigma.f[0]), sign_cmp.u32[0]); - eta.u32[1] = sse4_2::blendv( - sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[1])), - sse4_2::get_exponent(sigma.f[1]), sign_cmp.u32[1]); -} +// template <> +// scalar::floatx2_t sr_roundx2(const scalar::floatx2_t &sigma, +// const scalar::floatx2_t &tau, +// const scalar::floatx2_t &z) { +// using T = scalar::floatx2_t; +// if (sse4_2::not0(isnumberx2(sigma, tau))) { +// return {sr_round(sigma.f[0], tau.f[0], z.f[0]), +// sr_round(sigma.f[1], tau.f[1], z.f[1])}; +// } +// constexpr int32_t mantissa = sr::utils::IEEE754::mantissa; +// const auto sign_tau = sse4_2::cmplt(tau, sse4_2::setzero()); +// const auto sign_sigma = sse4_2::cmplt(sigma, sse4_2::setzero()); +// const scalar::int32x2_t eta; +// const auto sign_cmp = scalar::bitwise_xor(sign_tau, sign_sigma); +// scalar::int32x2_t eta; +// eta.u32[0] = sse4_2::blendv( +// sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[0])), +// sse4_2::get_exponent(sigma.f[0]), sign_cmp.u32[0]); +// eta.u32[1] = sse4_2::blendv( +// sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[1])), +// sse4_2::get_exponent(sigma.f[1]), sign_cmp.u32[1]); +// } template T sr_add(T a, T b) { debug_start(); @@ -148,35 +149,35 @@ template T sr_add(T a, T b) { return sigma + round; } -template T sr_addx2(T a, T b) { - if (sse4_2::not0(isnumberx2(a, b))) { - if constexpr (std::is_same::value) { - return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}}; - } - if constexpr (std::is_same::value) { - return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}}; - } - return sse4_2::add(a, b); - } - T z = get_rand_double01_x2(); - T tau, sigma, round; - twosumx2(a, b, sigma, tau); - round = sr_roundx2(sigma, tau, z); - return sigma + round; -} +// template T sr_addx2(T a, T b) { +// if (sse4_2::not0(isnumberx2(a, b))) { +// if constexpr (std::is_same::value) { +// return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}}; +// } +// if constexpr (std::is_same::value) { +// return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}}; +// } +// return sse4_2::add(a, b); +// } +// T z = get_rand_double01_x2(); +// T tau, sigma, round; +// twosumx2(a, b, sigma, tau); +// round = sr_roundx2(sigma, tau, z); +// return sigma + round; +// } -template <> -scalar::floatx2_t sr_addx2(scalar::floatx2_t a, scalar::floatx2_t b) { - using T = scalar::floatx2_t; - if (sse4_2::not0(isnumberx2(a, b))) { - return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}}; - } - T z = get_rand_float01_x2(); - T tau, sigma, round; - twosumx2(a, b, sigma, tau); - round = sr_roundx2(sigma, tau, z); - return scalar::add(sigma, round); -} +// template <> +// scalar::floatx2_t sr_addx2(scalar::floatx2_t a, scalar::floatx2_t b) { +// using T = scalar::floatx2_t; +// if (sse4_2::not0(isnumberx2(a, b))) { +// return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}}; +// } +// T z = get_rand_float01_x2(); +// T tau, sigma, round; +// twosumx2(a, b, sigma, tau); +// round = sr_roundx2(sigma, tau, z); +// return scalar::add(sigma, round); +// } template T sr_sub(T a, T b) { return sr_add(a, -b); } diff --git a/src/libvfcinstrumentonline/rand/src/sr_avx.hpp b/src/libvfcinstrumentonline/rand/src/sr_avx.hpp new file mode 100644 index 00000000..0dddfa0d --- /dev/null +++ b/src/libvfcinstrumentonline/rand/src/sr_avx.hpp @@ -0,0 +1,292 @@ +#include +#include +#include +#include + +#include "rand.hpp" +#include "utils.hpp" + +#ifndef __AVX__ +#error "AVX is required for this file" +#endif + +namespace avx { + +typedef __m128d double2; +typedef __m128i short8; +typedef __m128i int4; +typedef __m128i long2; +typedef __m128 float4; + +typedef __m256d double4; +typedef __m256i short16; +typedef __m256i long4; +typedef __m256i int8; +typedef __m256 float8; + +void debug_m256d(const char msg[], __m256d a) { + std::cout << msg << ": "; + for (int i = 0; i < sizeof(a) / sizeof(double); i++) + std::cout << a[i] << " "; + std::cout << std::endl; +} + +namespace x2double { + +static inline double2 get_rand_double01() { + return xoroshiro256plus::sse4_2::get_rand_double01(rng_state); +} + +static inline double2 get_predecessor_abs(double2 a) { + const __m128d phi = _mm_set1_pd(1.0 - 0x1.0p-53); + return _mm_mul_pd(a, phi); +} + +static inline long2 get_exponent(double2 a) { + const double2 zero = _mm_setzero_pd(); + const long2 exp_mask = _mm_set1_epi64x(0x7ff0000000000000ULL); + const long2 bias = _mm_set1_pd(1023); + + double2 is_zero = _mm_cmp_pd(a, zero, _CMP_EQ_OQ); + + // Extract exponent using floating-point operations + long2 a_bits = _mm_castpd_si128(a); + long2 exp_bits = _mm_and_pd(a_bits, exp_mask); + exp_bits = _mm_srli_epi64(exp_bits, 52); + + // Subtract bias + exp_bits = _mm_sub_epi64(exp_bits, bias); + + // Blend result with zero for zero inputs + return _mm_blendv_pd(exp_bits, _mm_setzero_pd(), is_zero); +} + +static inline double2 isnumber(double2 a, double2 b) { + const long2 zero = _mm_setzero_si128(); + const long2 naninf_mask = _mm_set1_epi64x(0x7ff0000000000000); + + long2 a_int = _mm_castpd_si128(a); + long2 b_int = _mm_castpd_si128(b); + long2 a_not_zero = ~_mm_cmpeq_epi64(a_int, zero); + long2 b_not_zero = ~_mm_cmpeq_epi64(b_int, zero); + long2 a_not_naninf = + ~_mm_cmpeq_epi64(_mm_and_si128(a, naninf_mask), naninf_mask); + long2 b_not_naninf = + ~_mm_cmpeq_epi64(_mm_and_si128(b, naninf_mask), naninf_mask); + + return _mm_and_si128(_mm_and_si128(a_not_zero, a_not_naninf), + _mm_and_si128(b_not_zero, b_not_naninf)) +} + +static inline long2 cmpgt(long2 a, long2 b) { return _mm_cmpgt_epi64(a, b); } + +static inline double2 pow2(long2 n) { + + const int _mantissa = sr::utils::IEEE754::mantissa; + const int _min_exponent = sr::utils::IEEE754::min_exponent; + + long2 min_exp_vec = _mm_set1_epi64x(_min_exponent); + long2 one = _mm_set1_epi64x(1); + long2 mantissa_vec = _mm_set1_epi64x(_mantissa); + + long2 is_subnormal = cmpgt(min_exp_vec, n); + long2 precision_loss = _mm_sub_epi64(min_exp_vec, n - 1); + precision_loss = _mm_and_si128(precision_loss, is_subnormal); + + long2 n_adjusted = _mm_blendv_epi8(n, one, is_subnormal); + double2 res = _mm_blendv_pd(_mm_set1_pd(1.0), _mm_set1_pd(0.0), + _mm_castsi128_pd(is_subnormal)); + + long2 i = _mm_castpd_si128(res); + long2 shift = _mm_sub_epi64(mantissa_vec, precision_loss); + long2 to_add = _mm_sllv_epi64(n_adjusted, shift); + i = _mm_add_epi64(i, to_add); + return _mm_castsi128_pd(i); +} + +static inline double2 abs(double2 a) { + const double2 sign_mask = _mm_set1_pd(-0.0); + return _mm_andnot_pd(sign_mask, a); +} + +static inline double2 sr_round(double2 sigma, double2 tau, double2 z) { + const int _mantissa = sr::utils::IEEE754::mantissa; + const int _min_exponent = sr::utils::IEEE754::min_exponent; + + const double2 zero = _mm_setzero_pd(); + const long2 mantissa = _mm_set1_epi64x(_mantissa); + + double2 tau_is_zero = _mm_cmp_pd(tau, zero, _CMP_EQ_OQ); + double2 sign_tau = _mm_cmp_pd(tau, zero, _CMP_LT_OQ); + double2 sign_sigma = _mm_cmp_pd(sigma, zero, _CMP_LT_OQ); + double2 sign_diff = _mm_xor_pd(sign_tau, sign_sigma); + + double2 pred_sigma = get_predecessor_abs(sigma); + long2 eta = get_exponent(sigma); + long2 pred_eta = get_exponent(pred_sigma); + eta = _mm_blendv_pd(eta, pred_eta, sign_diff); + + double2 ulp = pow2(_mm_sub_epi64(eta, mantissa)); + ulp = _mm_blendv_pd(ulp, _mm_sub_pd(zero, ulp), sign_tau); + + double2 pi = _mm_mul_pd(ulp, z); + double2 abs_tau_plus_pi = abs(_mm_add_pd(tau, pi)); + double2 abs_ulp = abs(ulp); + double2 round = _mm_blendv_pd( + zero, ulp, _mm_cmp_pd(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ)); + + return _mm_blendv_pd(round, sigma, tau_is_zero); +} + +static inline double2 sr_add(double2 a, double2 b) { + double2 is_number = isnumber(a, b); + double2 normal_sum = _mm_add_pd(a, b); + + double2 z = get_rand_double01(); + double2 sigma, tau; + + double2 tmp = a; + a = _mm_min_pd(a, b); + b = _mm_max_pd(tmp, b); + + sigma = normal_sum; + double2 v = _mm_sub_pd(sigma, a); + tau = _mm_sub_pd(b, v); + + double2 round = sr_round(sigma, tau, z); + double2 stochastic_sum = _mm_add_pd(sigma, round); + + return _mm_blendv_pd(normal_sum, stochastic_sum, is_number); +} + +} // namespace x2double + +// Helper function to get random doubles between 0 and 1 +__m256d get_rand_double01_avx() { + // Implementation depends on your random number generator + // This is a placeholder + return xoroshiro256plus::avx::get_rand_double01(rng_state_x4); +} + +__m256d get_predecessor_abs_avx(__m256d a) { + const __m256d phi = _mm256_set1_pd(1.0 - 0x1.0p-53); + return _mm256_mul_pd(a, phi); +} + +__m256d get_exponent_avx(__m256d a) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d exp_mask = _mm256_set1_epi64x(0x7ff0000000000000ULL); + const __m256d bias = _mm256_set1_pd(1023.0); + + __m256d is_zero = _mm256_cmp_pd(a, zero, _CMP_EQ_OQ); + + // Extract exponent using floating-point operations + __m256d exp_bits = _mm256_and_pd(a, exp_mask); + + // Use multiplication to effectively right-shift by 52 bits + const __m256d shift_const = _mm256_set1_pd(1.0 / (1ULL << 52)); + __m256d exp = _mm256_mul_pd(exp_bits, shift_const); + + // Subtract bias + exp = _mm256_sub_pd(exp, bias); + + // Blend result with zero for zero inputs + return _mm256_blendv_pd(exp, zero, is_zero); +} + +__m256d isnumber_avx(__m256d a, __m256d b) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d naninf_mask = _mm256_set1_pd(0x7ff0000000000000); + + __m256d a_not_zero = _mm256_cmp_pd(a, zero, _CMP_NEQ_UQ); + __m256d b_not_zero = _mm256_cmp_pd(b, zero, _CMP_NEQ_UQ); + __m256d a_not_naninf = + _mm256_cmp_pd(_mm256_and_pd(a, naninf_mask), naninf_mask, _CMP_NEQ_UQ); + __m256d b_not_naninf = + _mm256_cmp_pd(_mm256_and_pd(b, naninf_mask), naninf_mask, _CMP_NEQ_UQ); + + return _mm256_and_pd(_mm256_and_pd(a_not_zero, a_not_naninf), + _mm256_and_pd(b_not_zero, b_not_naninf)); +} + +__m256i cmpgt(__m256d a, __m256d b) { + return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_GT_OQ)); +} + +__m256d pow2_avx(__m256i n) { + const int mantissa = sr::utils::IEEE754::mantissa; + const int min_exponent = sr::utils::IEEE754::min_exponent; + + __m256i min_exp_vec = _mm256_set1_epi64x(min_exponent); + __m256i one = _mm256_set1_epi64x(1); + __m256i mantissa_vec = _mm256_set1_epi64x(mantissa); + + __m256i is_subnormal = cmpgt(min_exp_vec, n); + __m256i precision_loss = min_exp_vec - n; + precision_loss = precision_loss - one; + precision_loss = precision_loss & is_subnormal; + + __m256i n_adjusted = _mm256_blendv_pd(n, one, is_subnormal); + + __m256d res = _mm256_blendv_pd(_mm256_set1_pd(1.0), _mm256_set1_pd(0.0), + _mm256_castsi256_pd(is_subnormal)); + + __m256i i = _mm256_castpd_si256(res); + __m256i shift = mantissa_vec - precision_loss; + __m256i to_add = n_adjusted << shift; + i += to_add; + + return _mm256_castsi256_pd(i); +} + +__m256d abs_pd(__m256d a) { + const __m256d sign_mask = _mm256_set1_pd(-0.0); + return _mm256_andnot_pd(sign_mask, a); +} + +__m256d sr_round_avx(__m256d sigma, __m256d tau, __m256d z) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d one = _mm256_set1_pd(1.0); + const __m256d mantissa = _mm256_set1_pd(52); + + __m256d tau_is_zero = _mm256_cmp_pd(tau, zero, _CMP_EQ_OQ); + __m256d sign_tau = _mm256_cmp_pd(tau, zero, _CMP_LT_OQ); + __m256d sign_sigma = _mm256_cmp_pd(sigma, zero, _CMP_LT_OQ); + __m256d sign_diff = _mm256_xor_pd(sign_tau, sign_sigma); + + __m256d pred_sigma = get_predecessor_abs_avx(sigma); + __m256d eta = _mm256_blendv_pd(get_exponent_avx(sigma), + get_exponent_avx(pred_sigma), sign_diff); + + __m256d ulp = pow2_avx(_mm256_sub_pd(eta, mantissa)); + ulp = _mm256_blendv_pd(ulp, _mm256_sub_pd(zero, ulp), sign_tau); + + __m256d pi = _mm256_mul_pd(ulp, z); + __m256d abs_tau_plus_pi = abs_pd(_mm256_add_pd(tau, pi)); + __m256d abs_ulp = abs_pd(ulp); + __m256d round = _mm256_blendv_pd( + zero, ulp, _mm256_cmp_pd(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ)); + + return _mm256_blendv_pd(round, sigma, tau_is_zero); +} + +__m256d sr_add_avx(__m256d a, __m256d b) { + __m256d is_number = isnumber_avx(a, b); + __m256d normal_sum = _mm256_add_pd(a, b); + + __m256d z = get_rand_double01_avx(); + debug_m256d("z", z); + __m256d sigma, tau; + + // Implement twosum using AVX instructions + sigma = _mm256_add_pd(a, b); + __m256d v = _mm256_sub_pd(sigma, a); + tau = _mm256_sub_pd(b, v); + + __m256d round = sr_round_avx(sigma, tau, z); + __m256d stochastic_sum = _mm256_add_pd(sigma, round); + + return _mm256_blendv_pd(normal_sum, stochastic_sum, is_number); +} + +} // namespace avx diff --git a/src/libvfcinstrumentonline/rand/src/sr_avx2.hpp b/src/libvfcinstrumentonline/rand/src/sr_avx2.hpp new file mode 100644 index 00000000..58cdb1ed --- /dev/null +++ b/src/libvfcinstrumentonline/rand/src/sr_avx2.hpp @@ -0,0 +1,371 @@ +#include +#include +#include +#include + +#include "rand.hpp" +#include "utils.hpp" + +#ifndef __AVX2__ +#error "AVX2 is required for this file" +#endif + +typedef __m256 float8; +typedef __m256i short16; +typedef __m256i int8; +typedef __m256i long4; +typedef __m256d double4; + +static inline void debug(const char msg[]) { std::cerr << msg << std::endl; } + +static inline void debug_m256d(const char msg[], double4 a) { + std::cerr << std::hexfloat; + std::cerr << msg << ": "; + for (int i = 0; i < sizeof(a) / sizeof(double); i++) + std::cerr << a[i] << " "; + std::cerr << std::endl; + std::cerr << std::defaultfloat; +} + +static inline void debug_m256(const char msg[], float8 a) { + std::cerr << std::hexfloat; + std::cerr << msg << ": "; + for (int i = 0; i < sizeof(a) / sizeof(float); i++) + std::cerr << a[i] << " "; + std::cerr << std::endl; + std::cerr << std::defaultfloat; +} + +static inline void debug_m256i(const char msg[], __m256i a, bool hex = true) { + if (hex) + std::cerr << std::hex; + else + std::cerr << std::dec; + std::cerr << msg << ": "; + for (int i = 0; i < sizeof(a) / sizeof(double); i++) + std::cerr << a[i] << " "; + std::cerr << std::endl; + std::cerr << std::defaultfloat; +} +// Helper function to get random doubles between 0 and 1 +static inline double4 get_rand_double01_avx2() { + // Implementation depends on your random number generator + // This is a placeholder + return xoroshiro256plus::avx2::get_rand_double01(rng_state_x4); +} + +static inline double4 get_predecessor_abs_avx2(double4 a) { + const double4 phi = _mm256_set1_pd(1.0 - 0x1.0p-53); + return _mm256_mul_pd(a, phi); +} + +static inline float8 get_predecessor_abs_avx2f(float8 a) { + const float8 phi = _mm256_set1_ps(1.0f - 0x1.0p-24f); + return _mm256_mul_ps(a, phi); +} + +static inline double4 get_exponent_avx2(double4 a) { + debug(" [get_exponent]\tstart"); + const double4 zero = _mm256_setzero_pd(); + const __m256i exp_mask = _mm256_set1_epi64x(0x7ff0000000000000ULL); + const __m256i bias = _mm256_set1_epi64x(1023); + + double4 is_zero = _mm256_cmp_pd(a, zero, _CMP_EQ_OQ); + + debug_m256d("\t\ta", a); + __m256i a_bits = _mm256_castpd_si256(a); + debug_m256i("\t\ta_bits", a_bits); + __m256i exp_bits = _mm256_and_si256(a_bits, exp_mask); + debug_m256i("\t\texp_bits", exp_bits); + exp_bits = _mm256_srli_epi64(exp_bits, 52); + debug_m256i("\t\texp_bits", exp_bits, false); + + exp_bits = _mm256_sub_epi64(exp_bits, bias); + debug_m256i("\t\texp", exp_bits, false); + + debug(" [get_exponent]\tend"); + return _mm256_blendv_pd(exp_bits, zero, is_zero); +} + +static inline float8 get_exponent_avx2f(float8 a) { + debug(" [get_exponent]\tstart"); + const float8 zero = _mm256_setzero_ps(); + const __m256i exp_mask = _mm256_set1_epi32(0x7f800000); + const __m256i bias = _mm256_set1_epi32(127); + + float8 is_zero = _mm256_cmp_ps(a, zero, _CMP_EQ_OQ); + + debug_m256("\t\ta", a); + __m256i a_bits = _mm256_castps_si256(a); + debug_m256i("\t\ta_bits", a_bits); + __m256i exp_bits = _mm256_and_si256(a_bits, exp_mask); + debug_m256i("\t\texp_bits", exp_bits); + exp_bits = _mm256_srli_epi32(exp_bits, 23); + debug_m256i("\t\texp_bits", exp_bits, false); + + exp_bits = exp_bits - bias; + debug_m256i("\t\texp", exp_bits, false); + + debug(" [get_exponent]\tend"); + return _mm256_blendv_ps(exp_bits, zero, is_zero); +} + +static inline double4 isnumber_avx2(double4 a, double4 b) { + const double4 zero = _mm256_setzero_pd(); + const double4 naninf_mask = _mm256_set1_pd(0x7ff0000000000000); + + double4 a_not_zero = _mm256_cmp_pd(a, zero, _CMP_NEQ_UQ); + double4 b_not_zero = _mm256_cmp_pd(b, zero, _CMP_NEQ_UQ); + double4 a_not_naninf = + _mm256_cmp_pd(_mm256_and_pd(a, naninf_mask), naninf_mask, _CMP_NEQ_UQ); + double4 b_not_naninf = + _mm256_cmp_pd(_mm256_and_pd(b, naninf_mask), naninf_mask, _CMP_NEQ_UQ); + + return _mm256_and_pd(_mm256_and_pd(a_not_zero, a_not_naninf), + _mm256_and_pd(b_not_zero, b_not_naninf)); +} + +static inline float8 isnumber_avx2f(float8 a, float8 b) { + const float8 zero = _mm256_setzero_ps(); + const float8 naninf_mask = _mm256_set1_ps(0x7f800000); + + float8 a_not_zero = _mm256_cmp_ps(a, zero, _CMP_NEQ_UQ); + float8 b_not_zero = _mm256_cmp_ps(b, zero, _CMP_NEQ_UQ); + float8 a_not_naninf = + _mm256_cmp_ps(_mm256_and_ps(a, naninf_mask), naninf_mask, _CMP_NEQ_UQ); + float8 b_not_naninf = + _mm256_cmp_ps(_mm256_and_ps(b, naninf_mask), naninf_mask, _CMP_NEQ_UQ); + + return _mm256_and_ps(_mm256_and_ps(a_not_zero, a_not_naninf), + _mm256_and_ps(b_not_zero, b_not_naninf)); +} + +// AVX2 version of cmpgt +static inline __m256i cmpgt(__m256i a, __m256i b) { + return _mm256_cmpgt_epi64(a, b); +} + +static inline __m256i cmpgtf(float8 a, float8 b) { + return _mm256_cmpgt_epi32(a, b); +} + +static inline double4 pow2_avx2(__m256i n) { + debug(" [POW2]\tstart"); + debug_m256i("\t\tn", n, false); + const int mantissa = sr::utils::IEEE754::mantissa; + const int min_exponent = sr::utils::IEEE754::min_exponent; + + __m256i min_exp_vec = _mm256_set1_epi64x(min_exponent); + __m256i one = _mm256_set1_epi64x(1); + __m256i mantissa_vec = _mm256_set1_epi64x(mantissa); + + __m256i is_subnormal = cmpgt(min_exp_vec, n); + __m256i precision_loss = _mm256_sub_epi64(min_exp_vec, n - 1); + precision_loss = _mm256_and_si256(precision_loss, is_subnormal); + debug_m256i("\t\tprecision_loss", precision_loss, false); + + __m256i n_adjusted = _mm256_blendv_epi8(n, one, is_subnormal); + + double4 res = _mm256_blendv_pd(_mm256_set1_pd(1.0), _mm256_set1_pd(0.0), + _mm256_castsi256_pd(is_subnormal)); + + __m256i i = _mm256_castpd_si256(res); + __m256i shift = _mm256_sub_epi64(mantissa_vec, precision_loss); + __m256i to_add = _mm256_sllv_epi64(n_adjusted, shift); + debug_m256i("\t\tto_add", to_add); + i = _mm256_add_epi64(i, to_add); + + debug_m256d("\t\tres", _mm256_castsi256_pd(i)); + debug(" [POW2]\tend"); + return _mm256_castsi256_pd(i); +} + +static inline float8 pow2_avx2f(__m256i n) { + debug(" [POW2]\tstart"); + debug_m256i("\t\tn", n, false); + const int mantissa = sr::utils::IEEE754::mantissa; + const int min_exponent = sr::utils::IEEE754::min_exponent; + + __m256i min_exp_vec = _mm256_set1_epi32(min_exponent); + __m256i one = _mm256_set1_epi32(1); + __m256i mantissa_vec = _mm256_set1_epi32(mantissa); + + __m256i is_subnormal = cmpgtf(min_exp_vec, n); + __m256i precision_loss = _mm256_sub_epi32(min_exp_vec, n - 1); + precision_loss = _mm256_and_si256(precision_loss, is_subnormal); + debug_m256i("\t\tprecision_loss", precision_loss, false); + + __m256i n_adjusted = _mm256_blendv_epi8(n, one, is_subnormal); + + float8 res = _mm256_blendv_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(0.0), + _mm256_castsi256_ps(is_subnormal)); + + __m256i i = _mm256_castps_si256(res); + __m256i shift = _mm256_sub_epi32(mantissa_vec, precision_loss); + __m256i to_add = _mm256_sllv_epi32(n_adjusted, shift); + debug_m256i("\t\tto_add", to_add); + i = _mm256_add_epi32(i, to_add); + + debug_m256("\t\tres", _mm256_castsi256_ps(i)); + debug(" [POW2]\tend"); + return _mm256_castsi256_ps(i); +} + +static inline double4 abs_pd(double4 a) { + const double4 sign_mask = _mm256_set1_pd(-0.0); + return _mm256_andnot_pd(sign_mask, a); +} + +static inline float8 abs_ps(float8 a) { + const float8 sign_mask = _mm256_set1_ps(-0.0f); + return _mm256_andnot_ps(sign_mask, a); +} + +static inline double4 sr_round_avx2(double4 sigma, double4 tau, double4 z) { + debug(" [SR_ROUND]\tstart"); + debug_m256d("\tsigma ", sigma); + debug_m256d("\ttau ", tau); + + const int _mantissa = sr::utils::IEEE754::mantissa; + const int _min_exponent = sr::utils::IEEE754::min_exponent; + + const double4 zero = _mm256_setzero_pd(); + const __m256i mantissa = _mm256_set1_epi64x(_mantissa); + + double4 tau_is_zero = _mm256_cmp_pd(tau, zero, _CMP_EQ_OQ); + double4 sign_tau = _mm256_cmp_pd(tau, zero, _CMP_LT_OQ); + double4 sign_sigma = _mm256_cmp_pd(sigma, zero, _CMP_LT_OQ); + double4 sign_diff = _mm256_xor_pd(sign_tau, sign_sigma); + + double4 pred_sigma = get_predecessor_abs_avx2(sigma); + __m256i eta = _mm256_blendv_pd(get_exponent_avx2(sigma), + get_exponent_avx2(pred_sigma), sign_diff); + + debug_m256i("eta-mantissa", eta - mantissa, false); + double4 ulp = pow2_avx2(_mm256_sub_epi64(eta, mantissa)); + debug_m256d("\tulp ", ulp); + ulp = _mm256_blendv_pd(ulp, _mm256_sub_pd(zero, ulp), sign_tau); + + double4 pi = _mm256_mul_pd(ulp, z); + double4 abs_tau_plus_pi = abs_pd(_mm256_add_pd(tau, pi)); + double4 abs_ulp = abs_pd(ulp); + double4 round = _mm256_blendv_pd( + zero, ulp, _mm256_cmp_pd(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ)); + + debug_m256d("\tz ", z); + debug_m256i("\teta ", eta, false); + debug_m256d("\tpi ", pi); + debug_m256d("\ttau+pi", _mm256_add_pd(tau, pi)); + debug_m256d("\tulp ", ulp); + debug_m256d("\tround ", round); + debug_m256d("\t--", _mm256_setzero_pd()); + debug(" [SR_ROUND]\tend"); + return _mm256_blendv_pd(round, sigma, tau_is_zero); +} + +static inline float8 sr_round_avx2f(float8 sigma, float8 tau, float8 z) { + debug(" [SR_ROUND]\tstart"); + debug_m256("\tsigma ", sigma); + debug_m256("\ttau ", tau); + + const int _mantissa = sr::utils::IEEE754::mantissa; + const int _min_exponent = sr::utils::IEEE754::min_exponent; + + const float8 zero = _mm256_setzero_ps(); + const __m256i mantissa = _mm256_set1_epi32(_mantissa); + + float8 tau_is_zero = _mm256_cmp_ps(tau, zero, _CMP_EQ_OQ); + float8 sign_tau = _mm256_cmp_ps(tau, zero, _CMP_LT_OQ); + float8 sign_sigma = _mm256_cmp_ps(sigma, zero, _CMP_LT_OQ); + float8 sign_diff = _mm256_xor_ps(sign_tau, sign_sigma); + + float8 pred_sigma = get_predecessor_abs_avx2f(sigma); + __m256i eta = _mm256_blendv_ps(get_exponent_avx2f(sigma), + get_exponent_avx2f(pred_sigma), sign_diff); + + debug_m256i("eta-mantissa", eta - mantissa, false); + float8 ulp = pow2_avx2f(eta - mantissa); + debug_m256("\tulp ", ulp); + ulp = _mm256_blendv_ps(ulp, _mm256_sub_ps(zero, ulp), sign_tau); + + float8 pi = _mm256_mul_ps(ulp, z); + float8 abs_tau_plus_pi = abs_ps(_mm256_add_ps(tau, pi)); + float8 abs_ulp = abs_ps(ulp); + float8 round = _mm256_blendv_ps( + zero, ulp, _mm256_cmp_ps(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ)); + + debug_m256("\tz ", z); + debug_m256i("\teta ", eta, false); + debug_m256("\tpi ", pi); + debug_m256("\ttau+pi", _mm256_add_ps(tau, pi)); + debug_m256("\tulp ", ulp); + debug_m256("\tround ", round); + debug_m256("\t--", _mm256_setzero_ps()); + debug(" [SR_ROUND]\tend"); + return _mm256_blendv_ps(round, sigma, tau_is_zero); +} + +static inline double4 sr_add_avx2(double4 a, double4 b) { + debug(" [SR_ADD]\t\tstart"); + double4 is_number = isnumber_avx2(a, b); + double4 normal_sum = _mm256_add_pd(a, b); + + double4 z = get_rand_double01_avx2(); + debug_m256d("\t\tz", z); + double4 sigma, tau; + + // Implement twosum using AVX instructions + // swap a and b if |a| < |b| + double4 swap = _mm256_cmp_pd(abs_pd(a), abs_pd(b), _CMP_LT_OQ); + double4 tmp = a; + a = _mm256_blendv_pd(a, b, swap); + b = _mm256_blendv_pd(b, tmp, swap); + + sigma = _mm256_add_pd(a, b); + double4 v = _mm256_sub_pd(sigma, a); + tau = _mm256_sub_pd(b, v); + + double4 round = sr_round_avx2(sigma, tau, z); + double4 stochastic_sum = _mm256_add_pd(sigma, round); + + debug_m256d("\t\tsigma", sigma); + debug_m256d("\t\ttau ", tau); + debug_m256d("\t\tstochastic_sum", stochastic_sum); + debug(" [SR_ADD]\t\tend"); + + return _mm256_blendv_pd(normal_sum, stochastic_sum, is_number); +} + +static inline double4 sr_sub_avx2(double4 a, double4 b) { + b = _mm256_xor_pd(b, _mm256_set1_pd(-0.0)); + return sr_add_avx2(a, b); +} + +static inline double4 sr_mul_avx2(double4 a, double4 b) { + debug(" [SR_MUL]\t\tstart"); + double4 is_number = isnumber_avx2(a, b); + double4 normal_mul = _mm256_mul_pd(a, b); + double4 z = get_rand_double01_avx2(); + double4 sigma = normal_mul; + double4 tau = _mm256_fmsub_pd(a, b, sigma); + double4 round = sr_round_avx2(sigma, tau, z); + double4 stochastic_mul = _mm256_add_pd(sigma, round); + + debug(" [SR_MUL]\t\tend"); + return _mm256_blendv_pd(normal_mul, stochastic_mul, is_number); +} + +static inline double4 sr_div_avx2(double4 a, double4 b) { + debug(" [SR_DIV]\t\tstart"); + double4 is_number = isnumber_avx2(a, b); + double4 normal_div = _mm256_div_pd(a, b); + double4 z = get_rand_double01_avx2(); + double4 sigma = normal_div; + double4 tau = + _mm256_fmadd_pd(_mm256_xor_pd(sigma, _mm256_set1_pd(-0.0)), b, a); + double4 round = sr_round_avx2(sigma, tau, z); + double4 stochastic_div = _mm256_add_pd(sigma, round); + + debug(" [SR_DIV]\t\tend"); + return _mm256_blendv_pd(normal_div, stochastic_div, is_number); +} \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/src/sr_sse4-2.hpp b/src/libvfcinstrumentonline/rand/src/sr_sse4-2.hpp new file mode 100644 index 00000000..93f36cbf --- /dev/null +++ b/src/libvfcinstrumentonline/rand/src/sr_sse4-2.hpp @@ -0,0 +1,186 @@ +#include +#include +#include +#include + +#include "rand.hpp" +#include "utils.hpp" +#include "xoroshiro256+.hpp" + +#ifndef __SSE4_2__ +#error "SSE4.2 is required for this file" +#endif + +namespace sse4_2 { + +typedef __m128d double2; +typedef __m128i short8; +typedef __m128i int4; +typedef __m128i long2; +typedef __m128 float4; + +void debug_m128d(const char msg[], __m128d a) { + std::cout << msg << ": "; + for (int i = 0; i < sizeof(a) / sizeof(double); i++) + std::cout << a[i] << " "; + std::cout << std::endl; +} + +void debug_m128i(const char msg[], __m128i a) { + std::cout << msg << ": "; + for (int i = 0; i < sizeof(a) / sizeof(int); i++) + std::cout << a[i] << " "; + std::cout << std::endl; +} + +namespace doublex2 { + +static inline double2 get_rand_double01() { + return xoroshiro256plus::sse4_2::get_rand_double01(rng_state_x2); +} + +static inline double2 get_predecessor_abs(double2 a) { + const double2 phi = _mm_set1_pd(1.0 - 0x1.0p-53); + return _mm_mul_pd(a, phi); +} + +static inline double2 get_exponent(double2 a) { + const double2 zero = _mm_setzero_pd(); + const long2 exp_mask = _mm_set1_epi64x(0x7ff0000000000000ULL); + const long2 bias = _mm_set1_pd(1023); + + double2 is_zero = _mm_cmp_pd(a, zero, _CMP_EQ_OQ); + + // Extract exponent using floating-point operations + long2 a_bits = _mm_castpd_si128(a); + long2 exp_bits = _mm_and_pd(a_bits, exp_mask); + exp_bits = _mm_srli_epi64(exp_bits, 52); + + // Subtract bias + exp_bits = _mm_sub_pd(exp_bits, bias); + + // Blend result with zero for zero inputs + return _mm_blendv_pd(exp_bits, zero, is_zero); +} + +static inline double2 isnumber(double2 a, double2 b) { + const double2 zero = _mm_setzero_pd(); + const double2 naninf_mask = _mm_set1_pd(0x7ff0000000000000); + + double2 a_not_zero = _mm_cmp_pd(a, zero, _CMP_NEQ_UQ); + double2 b_not_zero = _mm_cmp_pd(b, zero, _CMP_NEQ_UQ); + double2 a_not_naninf = + _mm_cmp_pd(_mm_and_pd(a, naninf_mask), naninf_mask, _CMP_NEQ_UQ); + double2 b_not_naninf = + _mm_cmp_pd(_mm_and_pd(b, naninf_mask), naninf_mask, _CMP_NEQ_UQ); + + return _mm_and_pd(_mm_and_pd(a_not_zero, a_not_naninf), + _mm_and_pd(b_not_zero, b_not_naninf)); +} + +static inline int4 cmpgt(double2 a, double2 b) { + return _mm_castpd_si128(_mm_cmp_pd(a, b, _CMP_GT_OQ)); +} + +static inline double2 pow2(long2 n) { + const int mantissa = sr::utils::IEEE754::mantissa; + const int min_exponent = sr::utils::IEEE754::min_exponent; + + const double2 zero = _mm_setzero_pd(); + const double2 one = _mm_set1_pd(1.0); + + long2 min_exp_vec = _mm_set1_epi64x(min_exponent); + long2 one = _mm_set1_epi64x(1); + long2 mantissa_vec = _mm_set1_epi64x(mantissa); + + long2 is_subnormal = _mm_cmpeq_epi64(min_exp_vec, n); + long2 precision_loss = _mm_sub_epi64(min_exp_vec, n - 1); + precision_loss = _mm_and_si128(precision_loss, is_subnormal); + + long2 n_adjusted = _mm_blendv_epi8(n, one, is_subnormal); + double2 res = _mm_blendv_pd(one, zero, _mm_castsi128_pd(is_subnormal)); + + long2 i = _mm_castpd_si128(res); + long2 shift = _mm_sub_epi64(mantissa_vec, precision_loss); + long2 to_add = _mm_sllv_epi64(n_adjusted, shift); + i = _mm_add_epi64(i, to_add); + return _mm_castsi128_pd(i); +} + +static inline double2 abs(double2 a) { + const long2 sign_mask = _mm_set1_epi64x(0x8000000000000000ULL); + return _mm_andnot_pd(_mm_castsi128_pd(sign_mask), a); +} + +static inline double2 sr_round(double2 sigma, double2 tau, double2 z) { + const int _mantissa = sr::utils::IEEE754::mantissa; + const int _min_exponent = sr::utils::IEEE754::min_exponent; + + const double2 zero = _mm_setzero_pd(); + const long2 mantissa = _mm_set1_epi64x(_mantissa); + + double2 tau_is_zero = _mm_cmp_pd(tau, zero, _CMP_EQ_OQ); + double2 sign_tau = _mm_and_pd(tau, _mm_set1_pd(-0.0)); + double2 sign_sigma = _mm_and_pd(sigma, _mm_set1_pd(-0.0)); + double2 sign_diff = _mm_xor_pd(sign_tau, sign_sigma); + + double2 pred_sigma = get_predecessor_abs(sigma); + long2 eta = + _mm_blendv_pd(get_exponent(sigma), get_exponent(pred_sigma), sign_diff); + + double2 ulp = pow2(_mm_sub_epi64(eta, mantissa)); + ulp = _mm_blendv_pd(ulp, -ulp, tau_is_zero); + double2 pi = ulp * z; + double2 abs_ulp = abs(ulp); + double2 abs_tau_plus_pi = abs(tau + pi); + double2 round = _mm_blendv_pd( + zero, sign_diff, _mm_cmp_pd(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ)); + return _mm_blendv_pd(round, sigma, tau_is_zero); +} + +static inline double2 sr_add(double2 a, double2 b) { + double2 is_number = isnumber(a, b); + double2 normal_sum = a + b; + double2 z = get_rand_double01(); + double2 tau, sigma, round; + + double2 tmp = a; + a = _mm_min_pd(a, b); + b = _mm_max_pd(tmp, b); + + sigma = normal_sum; + double2 v = sigma - a; + tau = b - v; + + round = sr_round(sigma, tau, z); + return sigma + round; +} + +static inline double2 sr_sub(double2 a, double2 b) { return sr_add(a, -b); } + +static inline double2 sr_mul(double2 a, double2 b) { + double2 is_number = isnumber(a, b); + double2 normal_sum = a + b; + double2 z = get_rand_double01(); + double2 tau, sigma, round; + + double2 z = get_rand_double01(); + double2 tau, sigma, round; + twoproduct(a, b, sigma, tau); + round = sr_round(sigma, tau, z); + return sigma + round; +} + +static inline double2 sr_div(double2 a, double2 b) { + if (not isnumber(a, b)) { + return a / b; + } + double2 z = get_rand_double01(); + double2 tau, sigma, round; + twodiv(a, b, sigma, tau); + round = sr_round(sigma, tau, z); + return sigma + round; +} + +} // namespace doublex2 +} // namespace sse4_2 diff --git a/src/libvfcinstrumentonline/rand/src/utils.hpp b/src/libvfcinstrumentonline/rand/src/utils.hpp index c13019a3..a9c89b09 100644 --- a/src/libvfcinstrumentonline/rand/src/utils.hpp +++ b/src/libvfcinstrumentonline/rand/src/utils.hpp @@ -3,6 +3,7 @@ #include #include +#include #include "vector_types.hpp" @@ -194,7 +195,7 @@ template T pow2(int n) { return res; } -// TODO: finish implementing this function +// TODO: finish to implement this function template T add_round_odd(T a, T b) { // return addition with rounding to odd // https://www.lri.fr/~melquion/doc/08-tc.pdf diff --git a/src/libvfcinstrumentonline/rand/src/vector_types.hpp b/src/libvfcinstrumentonline/rand/src/vector_types.hpp index 2f1a188a..3e31cff6 100644 --- a/src/libvfcinstrumentonline/rand/src/vector_types.hpp +++ b/src/libvfcinstrumentonline/rand/src/vector_types.hpp @@ -13,6 +13,7 @@ typedef union { uint32_t u32[2]; int32_t i32[2]; } double_t; + typedef double_t floatx2_t; typedef double_t uint64_t; typedef double_t int64_t; @@ -20,8 +21,9 @@ typedef double_t uint32x2_t; typedef double_t int32x2_t; typedef double_t boolx2_t; -floatx2_t add(floatx2_t a, - floatx2_t b){return {.f = {a.f[0] + b.f[0], a.f[1] + b.f[1]}}}; +floatx2_t add(floatx2_t a, floatx2_t b) { + return {.f = {a.f[0] + b.f[0], a.f[1] + b.f[1]}}; +} boolx2_t cmplt(floatx2_t a, floatx2_t b) { return {.u32 = {a.f[0] < b.f[0], a.f[1] < b.f[1]}}; } @@ -34,6 +36,7 @@ boolx2_t bitwise_xor(boolx2_t a, boolx2_t b) { floatx2_t mul(floatx2_t a, floatx2_t b) { return {.f = {a.f[0] * b.f[0], a.f[1] * b.f[1]}}; +} } // namespace scalar @@ -80,19 +83,23 @@ doublex2_t add(doublex2_t a, doublex2_t b) { return _mm_add_pd(a, b); }; } // namespace sse4_2 namespace avx { -typedef __m128i uint64x2_t; -typedef __m128i uint32x4_t; -typedef __m128 floatx4_t; -typedef __m128d doublex2_t; +typedef __m256i int64x4_t; +typedef __m256i int32x8_t; +typedef __m256i uint64x4_t; +typedef __m256i uint32x8_t; +typedef __m256 floatx8_t; +typedef __m256d doublex4_t; /* The current state of the generators. */ typedef struct { - __m128i s[4]; + __m256i s[4]; } state; } // namespace avx namespace avx2 { +typedef __m256i int64x4_t; +typedef __m256i int32x8_t; typedef __m256i uint64x4_t; typedef __m256i uint32x8_t; typedef __m256 floatx8_t; diff --git a/src/libvfcinstrumentonline/rand/src/xoroshiro256+.hpp b/src/libvfcinstrumentonline/rand/src/xoroshiro256+.hpp new file mode 100644 index 00000000..a145e67b --- /dev/null +++ b/src/libvfcinstrumentonline/rand/src/xoroshiro256+.hpp @@ -0,0 +1,406 @@ +#ifndef __VERIFICARLO_SRLIB_RAND_XOROSHIRO256P_HPP__ +#define __VERIFICARLO_SRLIB_RAND_XOROSHIRO256P_HPP__ + +#include + +#include +#include +#include + +#include "vector_types.hpp" + +namespace xoroshiro256plus { + +namespace scalar { +typedef uint64_t rn_uint64; + +/* The current state of the generators. */ +typedef struct { + uint64_t s[4]; +} state; + +static inline uint64_t rotl(uint64_t x, int k) { + return (x << k) | (x >> (64 - k)); +} + +static inline uint64_t next(state &s) { + uint64_t result = s.s[0] + s.s[3]; + uint64_t t = s.s[1] << 17; + + s.s[2] ^= s.s[0]; + s.s[3] ^= s.s[1]; + s.s[1] ^= s.s[2]; + s.s[0] ^= s.s[3]; + s.s[2] ^= t; + s.s[3] = rotl(s.s[3], 45); + + return result; +} + +void init(state &s, const std::array &seeds) { + s.s[0] = seeds[0]; + s.s[1] = 0; + s.s[2] = 0; + s.s[3] = 0; +} + +} // namespace scalar + +namespace sse4_2 { +/* The current state of the generators. */ +typedef struct { + __m128i s[4]; +} state; + +__attribute__((target("sse4.2"))) static __inline __m128i rotl(__m128i x, + int k) { + return _mm_or_si128(_mm_slli_epi64(x, k), _mm_srli_epi64(x, 64 - k)); +} + +__attribute__((target("sse4.2"))) static inline __m128i next(state &s) { + // result[i] = s[0][i] + s[3][i]; + __m128i result = _mm_add_epi64(s.s[0], s.s[3]); + + // t[i] = s[1][i] << 17; + __m128i t = _mm_slli_epi64(s.s[1], 17); + + // s[2][i] ^= s[0][i]; + s.s[2] = _mm_xor_si128(s.s[2], s.s[0]); + + // s[3][i] ^= s[1][i]; + s.s[3] = _mm_xor_si128(s.s[3], s.s[1]); + + // s[1][i] ^= s[2][i]; + s.s[1] = _mm_xor_si128(s.s[1], s.s[2]); + + // s[0][i] ^= s[3][i]; + s.s[0] = _mm_xor_si128(s.s[0], s.s[3]); + + // s[2][i] ^= t[i]; + s.s[2] = _mm_xor_si128(s.s[2], t); + + // s[3][i] = rotl(s[3][i], 45); + s.s[3] = rotl(s.s[3], 45); + + return result; +} + +__attribute__((target("sse4.2"))) static inline __m128d +get_rand_double01(state &s) { + __m128i rng = next(s); + __m128i shift = _mm_srli_epi64(rng, 11); + __m128d shiftd = _mm_cvtepi64_pd(shift) * 0x1.0p-53; + return shiftd; +} + +__attribute__((target("sse4.2"))) void +init(state &s, const std::array &seeds) { + s.s[0] = _mm_set_epi64x(seeds[0], seeds[1]); + s.s[1] = _mm_setzero_si128(); + s.s[2] = _mm_setzero_si128(); + s.s[3] = _mm_setzero_si128(); +} + +} // namespace sse4_2 + +namespace avx { + +void debug_m256i(const char msg[], __m256i a = _mm256_setzero_si256(), + bool novalue = false) { + printf("%s: ", msg); + if (novalue) { + printf("\n"); + return; + } + for (int i = 0; i < 4; i++) { + printf("%016lx ", (uint64_t)a[i]); + } + printf("\n"); +} + +void debug_m256d(const char msg[], __m256d a = _mm256_setzero_pd(), + bool novalue = false) { + printf("%s: ", msg); + if (novalue) { + printf("\n"); + return; + } + for (int i = 0; i < 4; i++) { + printf("%+.13a ", a[i]); + } + printf("\n"); +} + +void debug(const char msg[]) { debug_m256i(msg, _mm256_setzero_si256(), true); } + +/* The current state of the generators. */ +typedef struct { + __m256i s[4]; +} state; + +__attribute__((target("avx"))) static __inline __m256i rotl(__m256i x, int k) { + return x << k | (x >> (64 - k)); +} + +__attribute__((target("avx"))) static inline __m256i next(state &s) { + debug_m256i("state[0]", s.s[0]); + debug_m256i("state[1]", s.s[1]); + debug_m256i("state[2]", s.s[2]); + debug_m256i("state[3]", s.s[3]); + debug("----\n"); + // result[i] = s[0][i] + s[3][i]; + __m256i result = s.s[0] + s.s[3]; + debug("result[i] = s[0][i] + s[3][i];"); + debug_m256i("result ", result); + + // t[i] = s[1][i] << 17; + __m256i t = s.s[1] << 17; + debug("t[i] = s[1][i] << 17;"); + debug_m256i("t ", t); + + // s[2][i] ^= s[0][i]; + s.s[2] = s.s[2] ^ s.s[0]; + debug("s[2][i] ^= s[0][i];"); + debug_m256i("s[2] ", s.s[2]); + + // s[3][i] ^= s[1][i]; + s.s[3] = s.s[3] ^ s.s[1]; + debug("s[3][i] ^= s[1][i];"); + debug_m256i("s[3] ", s.s[3]); + + // s[1][i] ^= s[2][i]; + s.s[1] = s.s[1] ^ s.s[2]; + debug("s[1][i] ^= s[2][i];"); + debug_m256i("s[1] ", s.s[1]); + + // s[0][i] ^= s[3][i]; + s.s[0] = s.s[0] ^ s.s[3]; + debug("s[0][i] ^= s[3][i];"); + debug_m256i("s[0] ", s.s[0]); + + // s[2][i] ^= t[i]; + s.s[2] = s.s[2] ^ t; + debug("s[2][i] ^= t[i];"); + debug_m256i("s[2] ", s.s[2]); + + // s[3][i] = rotl(s[3][i], 45); + s.s[3] = rotl(s.s[3], 45); + debug("s[3][i] = rotl(s[3][i], 45);"); + debug_m256i("s[3] ", s.s[3]); + + return result; +} + +__attribute__((target("avx"))) static inline __m256d +get_rand_double01(state &s) { + __m256i rng = next(s); + __m128i rng_lo = _mm256_extractf128_si256(rng, 0); + __m128i rng_hi = _mm256_extractf128_si256(rng, 1); + debug_m256i("next:", rng); + debug("----\n"); + __m128i shift_lo = _mm_srli_epi64(rng_lo, 11); + __m128i shift_hi = _mm_srli_epi64(rng_hi, 11); + debug_m256i("shift", _mm256_set_m128i(shift_hi, shift_lo)); + debug_m256d("shift", _mm256_set_m128d(_mm_cvtepi64_pd(shift_hi), + _mm_cvtepi64_pd(shift_lo))); + __m128d shiftd_lo = _mm_cvtepi64_pd(shift_lo) * 0x1.0p-53; + __m128d shiftd_hi = _mm_cvtepi64_pd(shift_hi) * 0x1.0p-53; + __m256d shiftd = _mm256_set_m128d(shiftd_hi, shiftd_lo); + debug_m256d("shiftd", shiftd); + return shiftd; +} + +__attribute__((target("avx"))) void init(state &s, + const std::array &seeds) { + s.s[0] = _mm256_set_epi64x(seeds[0], seeds[1], seeds[2], seeds[3]); + s.s[1] = _mm256_setzero_si256(); + s.s[2] = _mm256_setzero_si256(); + s.s[3] = _mm256_setzero_si256(); +} + +} // namespace avx + +namespace avx2 { + +#define RET_DEBUG +#ifdef RET_DEGUG +#define RET return; +#else +#define RET +#endif + +void debug_m128i(const char msg[], __m128i a = _mm_setzero_si128(), + bool novalue = false) { + return; + printf("%s: ", msg); + if (novalue) { + printf("\n"); + return; + } + for (int i = 0; i < 2; i++) { + printf("%016lx ", (uint64_t)a[i]); + } + printf("\n"); +} + +void debug_m256i(const char msg[], __m256i a = _mm256_setzero_si256(), + bool novalue = false) { + return; + printf("%s: ", msg); + if (novalue) { + printf("\n"); + return; + } + for (int i = 0; i < 4; i++) { + printf("%016lx ", (uint64_t)a[i]); + } + printf("\n"); +} + +void debug_m256d(const char msg[], __m256d a = _mm256_setzero_pd(), + bool novalue = false) { + return; + printf("%s: ", msg); + if (novalue) { + printf("\n"); + return; + } + for (int i = 0; i < 4; i++) { + printf("%+.13a ", a[i]); + } + printf("\n"); +} + +void debug_m128d(const char msg[], __m128d a = _mm_setzero_pd(), + bool novalue = false) { + return; + printf("%s: ", msg); + if (novalue) { + printf("\n"); + return; + } + for (int i = 0; i < 2; i++) { + printf("%+.13a ", a[i]); + } + printf("\n"); +} + +void debug(const char msg[]) { debug_m256i(msg, _mm256_setzero_si256(), true); } + +/* The current state of the generators. */ +typedef struct { + __m256i s[4]; +} state; + +__attribute__((target("avx2"))) static __inline __m256i rotl(__m256i x, int k) { + return _mm256_or_si256(_mm256_slli_epi64(x, k), _mm256_srli_epi64(x, 64 - k)); +} + +__attribute__((target("avx2"))) static inline __m256i next(state &s) { + // result[i] = s[0][i] + s[3][i]; + __m256i result = _mm256_add_epi64(s.s[0], s.s[3]); + + // t[i] = s[1][i] << 17; + __m256i t = _mm256_slli_epi64(s.s[1], 17); + + // s[2][i] ^= s[0][i]; + s.s[2] = _mm256_xor_si256(s.s[2], s.s[0]); + + // s[3][i] ^= s[1][i]; + s.s[3] = _mm256_xor_si256(s.s[3], s.s[1]); + + // s[1][i] ^= s[2][i]; + s.s[1] = _mm256_xor_si256(s.s[1], s.s[2]); + + // s[0][i] ^= s[3][i]; + s.s[0] = _mm256_xor_si256(s.s[0], s.s[3]); + + // s[2][i] ^= t[i]; + s.s[2] = _mm256_xor_si256(s.s[2], t); + + // s[3][i] = rotl(s[3][i], 45); + s.s[3] = rotl(s.s[3], 45); + + return result; +} + +__m256d get_rand_double01(state &s) { + __m256i rng = next(s); + debug_m256i("rng", rng); + __m256i shift = _mm256_srli_epi64(rng, 11); + __m128i shift_lo = _mm256_extractf128_si256(shift, 0); + __m128i shift_hi = _mm256_extractf128_si256(shift, 1); + // convert to double + debug_m128i("shift lo", shift_lo); + debug_m128i("shift hi", shift_hi); + debug_m128d("shiftd lo", _mm_castsi128_pd(shift_lo)); + debug_m128d("shiftd hi", _mm_castsi128_pd(shift_hi)); + __m256d x = _mm256_set_pd(shift_hi[1], shift_hi[0], shift_lo[1], shift_lo[0]); + debug_m256d("x", x * 0x1.0p-53); + return x * 0x1.0p-53; +} + +__attribute__((target("avx2"))) void +init(state &s, const std::array &seeds) { + s.s[0] = _mm256_set_epi64x(seeds[0], seeds[1], seeds[2], seeds[3]); + s.s[1] = _mm256_setzero_si256(); + s.s[2] = _mm256_setzero_si256(); + s.s[3] = _mm256_setzero_si256(); +} + +} // namespace avx2 + +namespace avx512 { +typedef __m512i rn_uint64_x8; + +/* The current state of the generators. */ +typedef struct { + __m512i s[4]; +} state; + +__attribute__((target("avx512f"))) static __inline __m512i rotl(__m512i x, + int k) { + return _mm512_or_si512(_mm512_slli_epi64(x, k), _mm512_srli_epi64(x, 64 - k)); +} + +__attribute__((target("avx512f"))) static inline __m512i next(state &s) { + // result[i] = s[0][i] + s[3][i]; + __m512i result = _mm512_add_epi64(s.s[0], s.s[3]); + + // t[i] = s[1][i] << 17; + __m512i t = _mm512_slli_epi64(s.s[1], 17); + + // s[2][i] ^= s[0][i]; + s.s[2] = _mm512_xor_si512(s.s[2], s.s[0]); + + // s[3][i] ^= s[1][i]; + s.s[3] = _mm512_xor_si512(s.s[3], s.s[1]); + + // s[1][i] ^= s[2][i]; + s.s[1] = _mm512_xor_si512(s.s[1], s.s[2]); + + // s[0][i] ^= s[3][i]; + s.s[0] = _mm512_xor_si512(s.s[0], s.s[3]); + + // s[2][i] ^= t[i]; + s.s[2] = _mm512_xor_si512(s.s[2], t); + + // s[3][i] = rotl(s[3][i], 45); + s.s[3] = rotl(s.s[3], 45); + + return result; +} + +__attribute__((target("avx512f"))) void +init(state &s, const std::array &seeds) { + s.s[0] = _mm512_set_epi64(seeds[0], seeds[1], seeds[2], seeds[3], seeds[4], + seeds[5], seeds[6], seeds[7]); + s.s[1] = _mm512_setzero_si512(); + s.s[2] = _mm512_setzero_si512(); + s.s[3] = _mm512_setzero_si512(); +} + +} // namespace avx512 + +} // namespace xoroshiro256plus +#endif // __VERIFICARLO_SRLIB_RAND_XOROSHIRO256P_HPP__ \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/test.cpp b/src/libvfcinstrumentonline/rand/test.cpp index 179ede24..7e22ad0c 100644 --- a/src/libvfcinstrumentonline/rand/test.cpp +++ b/src/libvfcinstrumentonline/rand/test.cpp @@ -21,7 +21,7 @@ template T apply_op(const char op, T a, T b) { return a + b; case '-': return a - b; - case '*': + case 'x': return a * b; case '/': return a / b; diff --git a/src/libvfcinstrumentonline/rand/test.sh b/src/libvfcinstrumentonline/rand/test.sh index 4dfaa536..95909a8d 100755 --- a/src/libvfcinstrumentonline/rand/test.sh +++ b/src/libvfcinstrumentonline/rand/test.sh @@ -5,8 +5,8 @@ if [ $# -ne 3 ]; then exit 1 fi -verificarlo-c++ -DREAL=$REAL --function=apply_op test.cpp -std=c++17 -Wall -o test +verificarlo-c++ -DREAL=$REAL --function=_Z8apply_opIdET_cS0_S0_ test.cpp -std=c++17 -Wall -o test -export VFC_BACKENDS="libinterflop_mca.so --precision-binary64=53 --precision-binary32=24" +export VFC_BACKENDS="libinterflop_mca.so --mode=rr --precision-binary64=53 --precision-binary32=24" export VFC_BACKENDS_LOGGER=False ./test $1 $2 $3 diff --git a/tests/test_online_instrumentation_sr/test.c b/tests/test_online_instrumentation_sr/test.c deleted file mode 100644 index 9bc9a110..00000000 --- a/tests/test_online_instrumentation_sr/test.c +++ /dev/null @@ -1,144 +0,0 @@ -#include "interflop/common/float_const.h" -#include -#include -#include -#include -#include -#include - -#ifndef REAL_TYPE -#warning "REAL is not defined, defaulting to double" -#define REAL double -#define REAL_TYPE 1 -#endif - -#if REAL_TYPE == 1 -#define DOUBLE -#define REAL double -#elif REAL_TYPE == 2 -#define FLOAT -#define REAL float -#endif - -#ifndef REAL_TYPE -#error "REAL_TYPE is not defined" -#endif - - -double add_double(double a, double b) { return a + b; } - -double sub_double(double a, double b) { return a - b; } - -double mul_double(double a, double b) { return a * b; } - -double div_double(double a, double b) { return a / b; } - -double sqrt_double(double a) { return __builtin_sqrt(a); } - -double fma_double(double a, double b, double c) { - return __builtin_fma(a, b, c); -} - -double operator_double(char op, double a, double b, double c) { - switch (op) { - case '+': - return add_double(a, b); - case '-': - return sub_double(a, b); - case 'x': - return mul_double(a, b); - case '/': - return div_double(a, b); - case 's': - return sqrt_double(a); - case 'f': - return fma_double(a, b, c); - default: - fprintf(stderr, "Error: unknown operation\n"); - abort(); - } -} - -float add_float(float a, float b) { return a + b; } - -float sub_float(float a, float b) { return a - b; } - -float mul_float(float a, float b) { return a * b; } - -float div_float(float a, float b) { return a / b; } - -float sqrt_float(float a) { return __builtin_sqrtf(a); } - -float fma_float(float a, float b, float c) { return __builtin_fmaf(a, b, c); } - -float operator_float(char op, float a, float b, float c) { - switch (op) { - case '+': - return add_float(a, b); - case '-': - return sub_float(a, b); - case 'x': - return mul_float(a, b); - case '/': - return div_float(a, b); - case 's': - return sqrt_float(a); - case 'f': - return fma_float(a, b, c); - default: - fprintf(stderr, "Error: unknown operation\n"); - abort(); - } -} - -// void run_test() { -// double a = 0.1, b = 0.01, c = 0.0, d = 0.0, tau = 0.0, sigma = 0.0; -// fprintf(stderr, "\n"); -// for (int i = 0; i < 10; i++) { -// twosum_double(a, b, &tau, &sigma); -// fprintf(stderr, "a = %.13a, b = %.13a, tau = %.13a, sigma = %.13a\n", a, -// b, -// tau, sigma); -// d = sr_rounding_double(tau, sigma, i / 10.0); -// fprintf(stderr, "round = %.13a\n", c); -// } -// } - -int main(int argc, const char *argv[]) { - - if (argc == 3) { - if (argv[1][0] != 's') { - fprintf(stderr, "usage: ./test a\n"); - exit(1); - } - } else if (argc == 5) { - if (argv[1][0] != 'f') { - fprintf(stderr, "usage: ./test a b c\n"); - exit(1); - } - - } else if (argc != 4) { - fprintf(stderr, "usage: ./test a b\n"); - exit(1); - } - // run_test(); - - const char op = argv[1][0]; - REAL a = atof(argv[2]), b = 0, c = 0; - - if (op != 's') - b = atof(argv[3]); - - if (op == 'f') - c = atof(argv[4]); - -#ifdef DOUBLE - // fprintf(stderr, "[double] op = %c, a = %.13a, b = %.13a\n", op, a, b); - printf("%.13a\n", operator_double(op, a, b, c)); -#elif defined(FLOAT) - // fprintf(stderr, "[float] op = %c, a = %.6a, b = %.6a\n", op, a, b); - printf("%.6a\n", operator_float(op, a, b, c)); -#endif - - return EXIT_SUCCESS; -} diff --git a/tests/test_online_instrumentation_sr/test.sh b/tests/test_online_instrumentation_sr/test.sh deleted file mode 100755 index d8559356..00000000 --- a/tests/test_online_instrumentation_sr/test.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -set -e - -echo "Test scalar" -./test_scalar.sh - -if [[ $? != 0 ]]; then - echo "Failed!" - exit 1 -fi - -echo "Test vector" -./test_vector.sh - -if [[ $? != 0 ]]; then - echo "Failed!" - exit 1 -else - echo "Success!" - exit 0 -fi