Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
yohanchatelain committed Aug 7, 2024
1 parent 1403adb commit 55c1dca
Show file tree
Hide file tree
Showing 15 changed files with 1,486 additions and 237 deletions.
130 changes: 130 additions & 0 deletions src/libvfcinstrumentonline/rand/app/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#include <iostream>

#include <immintrin.h>

#include "src/debug.hpp"
#include "src/main.hpp"
#include "src/rand.hpp"
#include "src/sr.hpp"
#include "src/sr_avx2.hpp"

#ifndef UNROLL
#define UNROLL 1
#endif

void run_avx2_add_d(__m256d a, __m256d b, __m256d c[UNROLL]) {
for (int i = 0; i < UNROLL; i++) {
c[i] = sr_add_avx2(a, b);
}
}

void run_avx2_mul_d(__m256d a, __m256d b, __m256d c[UNROLL]) {
for (int i = 0; i < UNROLL; i++) {
c[i] = sr_mul_avx2(a, b);
}
}

void run_avx2_div_d(__m256d a, __m256d b, __m256d c[UNROLL]) {
for (int i = 0; i < UNROLL; i++) {
c[i] = sr_div_avx2(a, b);
}
}

void run_add_d(double a, double b, double c[UNROLL]) {
for (int i = 0; i < UNROLL; i++) {
c[i] = sr_add(a, b);
}
}

void run_mul_d(double a, double b, double c[UNROLL]) {
for (int i = 0; i < UNROLL; i++) {
c[i] = sr_mul(a, b);
}
}

void run_div_d(double a, double b, double c[UNROLL]) {
for (int i = 0; i < UNROLL; i++) {
c[i] = sr_div(a, b);
}
}

void print_m256d(__m256d a) {
std::cout << std::hexfloat;
for (int i = 0; i < sizeof(a) / sizeof(double); i++)
std::cout << a[i] << " ";
std::cout << std::defaultfloat;
}

void print_array_m256d(__m256d a[UNROLL]) {
for (int i = 0; i < UNROLL; i++) {
print_m256d(a[i]);
std::cout << std::endl;
}
}

void print_array_d(double a[UNROLL]) {
std::cout << std::hexfloat;
for (int i = 0; i < UNROLL; i++) {
std::cout << a[i] << std::endl;
}
std::cout << std::defaultfloat;
}

void test_scalar_d(double a, double b) {
std::cout << "sr_add" << std::endl;
double cs[UNROLL];
run_add_d(a, b, cs);
print_array_d(cs);
std::cout << "sr_mul" << std::endl;
run_mul_d(a, b, cs);
print_array_d(cs);
std::cout << "sr_div" << std::endl;
run_div_d(a, b, cs);
print_array_d(cs);
}

void test_avx2_d(__m256d a, __m256d b) {
std::cout << "sr_add_avx" << std::endl;
__m256d c[UNROLL];
run_avx2_add_d(a, b, c);

std::cout << std::hexfloat;
print_array_m256d(c);

std::cout << "sr_mul_avx" << std::endl;
run_avx2_mul_d(a, b, c);
print_array_m256d(c);

std::cout << "sr_div_avx" << std::endl;
run_avx2_div_d(a, b, c);
print_array_m256d(c);
}

int main() {

init();

std::vector<double> aa, bb;
__m256d a, b;
std::cout << "sizeof a " << sizeof(a) << std::endl;
std::cout << "elements in a " << sizeof(a) / sizeof(double) << std::endl;

for (int i = 0; i < 4; i++) {
double x;
std::cin >> x;
aa.push_back(x);
}
for (int i = 0; i < 4; i++) {
double x;
std::cin >> x;
bb.push_back(x);
}

a = _mm256_loadu_pd(aa.data());
b = _mm256_loadu_pd(bb.data());

test_avx2_d(a, b);
test_scalar_d(a[0], b[0]);

return 0;
}
22 changes: 12 additions & 10 deletions src/libvfcinstrumentonline/rand/src/eft.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef __VERIFICARLO_SRLIB_EFT_HPP__
#define __VERIFICARLO_SRLIB_EFT_HPP__

#include <cmath>

#include "debug.hpp"
#include "vector_types.hpp"

Expand All @@ -15,16 +17,16 @@ template <typename T> void twosum(T a, T b, T &sigma, T &tau) {
debug_print("twosum(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau);
}

template <typename T> void twosumx2(T a, T b, T &sigma, T &tau) {
if (std::abs(a[0]) < std::abs(b[0]))
std::swap(a[0], b[0]);
if (std::abs(a[1]) < std::abs(b[1]))
std::swap(a[1], b[1]);
sigma = a + b;
T z = sigma - a;
tau = (a - (sigma - z)) + (b - z);
debug_print("twosumx2(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau);
}
// template <typename T> void twosumx2(T a, T b, T &sigma, T &tau) {
// if (std::abs(a[0]) < std::abs(b[0]))
// std::swap(a[0], b[0]);
// if (std::abs(a[1]) < std::abs(b[1]))
// std::swap(a[1], b[1]);
// sigma = a + b;
// T z = sigma - a;
// tau = (a - (sigma - z)) + (b - z);
// debug_print("twosumx2(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau);
// }

template <typename T> void twoprodfma(T a, T b, T &sigma, T &tau) {
sigma = a * b;
Expand Down
3 changes: 3 additions & 0 deletions src/libvfcinstrumentonline/rand/src/main.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ void init() {
auto seeds = get_seeds<1>();
xoroshiro256plus::scalar::init(rng_state, seeds);
debug_print("initialized xoroshiro256+ with seed %lu\n", seeds[0]);
auto seeds_avx2 = get_seeds<4>();
xoroshiro256plus::avx2::init(rng_state_x4, seeds_avx2);
debug_print("initialized xoroshiro256+ with seed %lu\n", seeds_avx2[0]);
#elif defined(SHISHUA)
auto seeds = get_seeds<4>();
uint64_t seed_state[4] = {seeds[0], seeds[1], seeds[2], seeds[3]};
Expand Down
16 changes: 16 additions & 0 deletions src/libvfcinstrumentonline/rand/src/rand.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#include <cstdlib>
#include <iostream>
#include <string>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <type_traits>
#include <unistd.h>
#include <vector>
Expand Down Expand Up @@ -37,12 +40,15 @@ using rng_t = xoroshiro256plus::scalar::rn_uint64;
typedef xoroshiro256plus::scalar::state rng_state_t;
static __thread rng_state_t rng_state;
using rn_int64_x2 = sse4_2::int64x2_t;
using rn_int64_x4 = avx::int64x4_t;
using rn_int32_x4 = sse4_2::int32x4_t;
using float2 = scalar::floatx2_t;
using float4 = sse4_2::floatx4_t;
using double2 = sse4_2::doublex2_t;
typedef xoroshiro256plus::sse4_2::state rng_state_x2_t;
static __thread rng_state_x2_t rng_state_x2;
typedef xoroshiro256plus::avx2::state rng_state_x4_t;
static __thread rng_state_x4_t rng_state_x4;
#elif defined(SHISHUA)
#include "shishua.h"
typedef prng_state rng_state_t;
Expand Down Expand Up @@ -107,6 +113,16 @@ rn_int64_x2 get_rand_uint64_x2() {
return rng;
}

rn_int64_x4 get_rand_uint64_x4() {
rn_int64_x4 rng;
#ifdef XOROSHIRO
rng = xoroshiro256plus::avx2::next(rng_state_x4);
#else
#error "No PRNG defined"
#endif
return rng;
}

uint8_t get_rand_uint1() { return _get_rand_uint64() & 0x1; }

#ifdef XOROSHIRO
Expand Down
101 changes: 51 additions & 50 deletions src/libvfcinstrumentonline/rand/src/sr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "debug.hpp"
#include "eft.hpp"
#include "rand.hpp"
#include "utils.hpp"

template <typename T> bool isnumber(T a, T b) {
Expand Down Expand Up @@ -110,28 +111,28 @@ template <typename T> T sr_roundx2(const T sigma, const T tau, const T z) {
return round;
}

template <>
scalar::floatx2_t sr_roundx2(const scalar::floatx2_t &sigma,
const scalar::floatx2_t &tau,
const scalar::floatx2_t &z) {
using T = scalar::floatx2_t;
if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(sigma, tau))) {
return {sr_round(sigma.f[0], tau.f[0], z.f[0]),
sr_round(sigma.f[1], tau.f[1], z.f[1])};
}
constexpr int32_t mantissa = sr::utils::IEEE754<float>::mantissa;
const auto sign_tau = sse4_2::cmplt(tau, sse4_2::setzero());
const auto sign_sigma = sse4_2::cmplt(sigma, sse4_2::setzero());
const scalar::int32x2_t eta;
const auto sign_cmp = scalar::bitwise_xor(sign_tau, sign_sigma);
scalar::int32x2_t eta;
eta.u32[0] = sse4_2::blendv(
sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[0])),
sse4_2::get_exponent(sigma.f[0]), sign_cmp.u32[0]);
eta.u32[1] = sse4_2::blendv(
sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[1])),
sse4_2::get_exponent(sigma.f[1]), sign_cmp.u32[1]);
}
// template <>
// scalar::floatx2_t sr_roundx2(const scalar::floatx2_t &sigma,
// const scalar::floatx2_t &tau,
// const scalar::floatx2_t &z) {
// using T = scalar::floatx2_t;
// if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(sigma, tau))) {
// return {sr_round(sigma.f[0], tau.f[0], z.f[0]),
// sr_round(sigma.f[1], tau.f[1], z.f[1])};
// }
// constexpr int32_t mantissa = sr::utils::IEEE754<float>::mantissa;
// const auto sign_tau = sse4_2::cmplt(tau, sse4_2::setzero());
// const auto sign_sigma = sse4_2::cmplt(sigma, sse4_2::setzero());
// const scalar::int32x2_t eta;
// const auto sign_cmp = scalar::bitwise_xor(sign_tau, sign_sigma);
// scalar::int32x2_t eta;
// eta.u32[0] = sse4_2::blendv(
// sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[0])),
// sse4_2::get_exponent(sigma.f[0]), sign_cmp.u32[0]);
// eta.u32[1] = sse4_2::blendv(
// sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[1])),
// sse4_2::get_exponent(sigma.f[1]), sign_cmp.u32[1]);
// }

template <typename T> T sr_add(T a, T b) {
debug_start();
Expand All @@ -148,35 +149,35 @@ template <typename T> T sr_add(T a, T b) {
return sigma + round;
}

template <typename T> T sr_addx2(T a, T b) {
if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(a, b))) {
if constexpr (std::is_same<T, scalar::floatx2_t>::value) {
return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
}
if constexpr (std::is_same<T, scalar::double_t>::value) {
return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
}
return sse4_2::add(a, b);
}
T z = get_rand_double01_x2();
T tau, sigma, round;
twosumx2(a, b, sigma, tau);
round = sr_roundx2(sigma, tau, z);
return sigma + round;
}
// template <typename T> T sr_addx2(T a, T b) {
// if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(a, b))) {
// if constexpr (std::is_same<T, scalar::floatx2_t>::value) {
// return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
// }
// if constexpr (std::is_same<T, scalar::double_t>::value) {
// return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
// }
// return sse4_2::add(a, b);
// }
// T z = get_rand_double01_x2();
// T tau, sigma, round;
// twosumx2(a, b, sigma, tau);
// round = sr_roundx2(sigma, tau, z);
// return sigma + round;
// }

template <>
scalar::floatx2_t sr_addx2(scalar::floatx2_t a, scalar::floatx2_t b) {
using T = scalar::floatx2_t;
if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(a, b))) {
return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
}
T z = get_rand_float01_x2();
T tau, sigma, round;
twosumx2(a, b, sigma, tau);
round = sr_roundx2(sigma, tau, z);
return scalar::add(sigma, round);
}
// template <>
// scalar::floatx2_t sr_addx2(scalar::floatx2_t a, scalar::floatx2_t b) {
// using T = scalar::floatx2_t;
// if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(a, b))) {
// return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
// }
// T z = get_rand_float01_x2();
// T tau, sigma, round;
// twosumx2(a, b, sigma, tau);
// round = sr_roundx2(sigma, tau, z);
// return scalar::add(sigma, round);
// }

template <typename T> T sr_sub(T a, T b) { return sr_add(a, -b); }

Expand Down
Loading

0 comments on commit 55c1dca

Please sign in to comment.