From 55c1dca413041ecd788ece5659ed809ad75d6a58 Mon Sep 17 00:00:00 2001
From: Yohan Chatelain <yohan.chatelain@gmail.com>
Date: Tue, 6 Aug 2024 22:04:33 -0400
Subject: [PATCH] WIP

---
 src/libvfcinstrumentonline/rand/app/main.cpp  | 130 ++++++
 src/libvfcinstrumentonline/rand/src/eft.hpp   |  22 +-
 src/libvfcinstrumentonline/rand/src/main.hpp  |   3 +
 src/libvfcinstrumentonline/rand/src/rand.hpp  |  16 +
 src/libvfcinstrumentonline/rand/src/sr.hpp    | 101 ++---
 .../rand/src/sr_avx.hpp                       | 292 +++++++++++++
 .../rand/src/sr_avx2.hpp                      | 371 ++++++++++++++++
 .../rand/src/sr_sse4-2.hpp                    | 186 ++++++++
 src/libvfcinstrumentonline/rand/src/utils.hpp |   3 +-
 .../rand/src/vector_types.hpp                 |  21 +-
 .../rand/src/xoroshiro256+.hpp                | 406 ++++++++++++++++++
 src/libvfcinstrumentonline/rand/test.cpp      |   2 +-
 src/libvfcinstrumentonline/rand/test.sh       |   4 +-
 tests/test_online_instrumentation_sr/test.c   | 144 -------
 tests/test_online_instrumentation_sr/test.sh  |  22 -
 15 files changed, 1486 insertions(+), 237 deletions(-)
 create mode 100644 src/libvfcinstrumentonline/rand/app/main.cpp
 create mode 100644 src/libvfcinstrumentonline/rand/src/sr_avx.hpp
 create mode 100644 src/libvfcinstrumentonline/rand/src/sr_avx2.hpp
 create mode 100644 src/libvfcinstrumentonline/rand/src/sr_sse4-2.hpp
 create mode 100644 src/libvfcinstrumentonline/rand/src/xoroshiro256+.hpp
 delete mode 100644 tests/test_online_instrumentation_sr/test.c
 delete mode 100755 tests/test_online_instrumentation_sr/test.sh
diff --git a/src/libvfcinstrumentonline/rand/app/main.cpp b/src/libvfcinstrumentonline/rand/app/main.cpp
new file mode 100644
index 00000000..17df1df3
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/app/main.cpp
@@ -0,0 +1,130 @@
+#include <iostream>
+
+#include <immintrin.h>
+
+#include "src/debug.hpp"
+#include "src/main.hpp"
+#include "src/rand.hpp"
+#include "src/sr.hpp"
+#include "src/sr_avx2.hpp"
+
+#ifndef UNROLL
+#define UNROLL 1
+#endif
+
+void run_avx2_add_d(__m256d a, __m256d b, __m256d c[UNROLL]) {
+  for (int i = 0; i < UNROLL; i++) {
+    c[i] = sr_add_avx2(a, b);
+  }
+}
+
+void run_avx2_mul_d(__m256d a, __m256d b, __m256d c[UNROLL]) {
+  for (int i = 0; i < UNROLL; i++) {
+    c[i] = sr_mul_avx2(a, b);
+  }
+}
+
+void run_avx2_div_d(__m256d a, __m256d b, __m256d c[UNROLL]) {
+  for (int i = 0; i < UNROLL; i++) {
+    c[i] = sr_div_avx2(a, b);
+  }
+}
+
+void run_add_d(double a, double b, double c[UNROLL]) {
+  for (int i = 0; i < UNROLL; i++) {
+    c[i] = sr_add(a, b);
+  }
+}
+
+void run_mul_d(double a, double b, double c[UNROLL]) {
+  for (int i = 0; i < UNROLL; i++) {
+    c[i] = sr_mul(a, b);
+  }
+}
+
+void run_div_d(double a, double b, double c[UNROLL]) {
+  for (int i = 0; i < UNROLL; i++) {
+    c[i] = sr_div(a, b);
+  }
+}
+
+void print_m256d(__m256d a) {
+  std::cout << std::hexfloat;
+  for (int i = 0; i < sizeof(a) / sizeof(double); i++)
+    std::cout << a[i] << " ";
+  std::cout << std::defaultfloat;
+}
+
+void print_array_m256d(__m256d a[UNROLL]) {
+  for (int i = 0; i < UNROLL; i++) {
+    print_m256d(a[i]);
+    std::cout << std::endl;
+  }
+}
+
+void print_array_d(double a[UNROLL]) {
+  std::cout << std::hexfloat;
+  for (int i = 0; i < UNROLL; i++) {
+    std::cout << a[i] << std::endl;
+  }
+  std::cout << std::defaultfloat;
+}
+
+void test_scalar_d(double a, double b) {
+  std::cout << "sr_add" << std::endl;
+  double cs[UNROLL];
+  run_add_d(a, b, cs);
+  print_array_d(cs);
+  std::cout << "sr_mul" << std::endl;
+  run_mul_d(a, b, cs);
+  print_array_d(cs);
+  std::cout << "sr_div" << std::endl;
+  run_div_d(a, b, cs);
+  print_array_d(cs);
+}
+
+void test_avx2_d(__m256d a, __m256d b) {
+  std::cout << "sr_add_avx" << std::endl;
+  __m256d c[UNROLL];
+  run_avx2_add_d(a, b, c);
+
+  std::cout << std::hexfloat;
+  print_array_m256d(c);
+
+  std::cout << "sr_mul_avx" << std::endl;
+  run_avx2_mul_d(a, b, c);
+  print_array_m256d(c);
+
+  std::cout << "sr_div_avx" << std::endl;
+  run_avx2_div_d(a, b, c);
+  print_array_m256d(c);
+}
+
+int main() {
+
+  init();
+
+  std::vector<double> aa, bb;
+  __m256d a, b;
+  std::cout << "sizeof a " << sizeof(a) << std::endl;
+  std::cout << "elements in a " << sizeof(a) / sizeof(double) << std::endl;
+
+  for (int i = 0; i < 4; i++) {
+    double x;
+    std::cin >> x;
+    aa.push_back(x);
+  }
+  for (int i = 0; i < 4; i++) {
+    double x;
+    std::cin >> x;
+    bb.push_back(x);
+  }
+
+  a = _mm256_loadu_pd(aa.data());
+  b = _mm256_loadu_pd(bb.data());
+
+  test_avx2_d(a, b);
+  test_scalar_d(a[0], b[0]);
+
+  return 0;
+}
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/src/eft.hpp b/src/libvfcinstrumentonline/rand/src/eft.hpp
index c26d9bb9..e9856ad1 100644
--- a/src/libvfcinstrumentonline/rand/src/eft.hpp
+++ b/src/libvfcinstrumentonline/rand/src/eft.hpp
@@ -1,6 +1,8 @@
 #ifndef __VERIFICARLO_SRLIB_EFT_HPP__
 #define __VERIFICARLO_SRLIB_EFT_HPP__
 
+#include <cmath>
+
 #include "debug.hpp"
 #include "vector_types.hpp"
 
@@ -15,16 +17,16 @@ template <typename T> void twosum(T a, T b, T &sigma, T &tau) {
   debug_print("twosum(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau);
 }
 
-template <typename T> void twosumx2(T a, T b, T &sigma, T &tau) {
-  if (std::abs(a[0]) < std::abs(b[0]))
-    std::swap(a[0], b[0]);
-  if (std::abs(a[1]) < std::abs(b[1]))
-    std::swap(a[1], b[1]);
-  sigma = a + b;
-  T z = sigma - a;
-  tau = (a - (sigma - z)) + (b - z);
-  debug_print("twosumx2(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau);
-}
+// template <typename T> void twosumx2(T a, T b, T &sigma, T &tau) {
+//   if (std::abs(a[0]) < std::abs(b[0]))
+//     std::swap(a[0], b[0]);
+//   if (std::abs(a[1]) < std::abs(b[1]))
+//     std::swap(a[1], b[1]);
+//   sigma = a + b;
+//   T z = sigma - a;
+//   tau = (a - (sigma - z)) + (b - z);
+//   debug_print("twosumx2(%.13a, %.13a) = %.13a, %.13a\n", a, b, sigma, tau);
+// }
 
 template <typename T> void twoprodfma(T a, T b, T &sigma, T &tau) {
   sigma = a * b;
diff --git a/src/libvfcinstrumentonline/rand/src/main.hpp b/src/libvfcinstrumentonline/rand/src/main.hpp
index c57ccab7..c7cc61db 100644
--- a/src/libvfcinstrumentonline/rand/src/main.hpp
+++ b/src/libvfcinstrumentonline/rand/src/main.hpp
@@ -32,6 +32,9 @@ void init() {
   auto seeds = get_seeds<1>();
   xoroshiro256plus::scalar::init(rng_state, seeds);
   debug_print("initialized xoroshiro256+ with seed %lu\n", seeds[0]);
+  auto seeds_avx2 = get_seeds<4>();
+  xoroshiro256plus::avx2::init(rng_state_x4, seeds_avx2);
+  debug_print("initialized xoroshiro256+ with seed %lu\n", seeds_avx2[0]);
 #elif defined(SHISHUA)
   auto seeds = get_seeds<4>();
   uint64_t seed_state[4] = {seeds[0], seeds[1], seeds[2], seeds[3]};
diff --git a/src/libvfcinstrumentonline/rand/src/rand.hpp b/src/libvfcinstrumentonline/rand/src/rand.hpp
index 295a8dea..0462f4d9 100644
--- a/src/libvfcinstrumentonline/rand/src/rand.hpp
+++ b/src/libvfcinstrumentonline/rand/src/rand.hpp
@@ -6,6 +6,9 @@
 #include <cstdlib>
 #include <iostream>
 #include <string>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
 #include <type_traits>
 #include <unistd.h>
 #include <vector>
@@ -37,12 +40,15 @@ using rng_t = xoroshiro256plus::scalar::rn_uint64;
 typedef xoroshiro256plus::scalar::state rng_state_t;
 static __thread rng_state_t rng_state;
 using rn_int64_x2 = sse4_2::int64x2_t;
+using rn_int64_x4 = avx::int64x4_t;
 using rn_int32_x4 = sse4_2::int32x4_t;
 using float2 = scalar::floatx2_t;
 using float4 = sse4_2::floatx4_t;
 using double2 = sse4_2::doublex2_t;
 typedef xoroshiro256plus::sse4_2::state rng_state_x2_t;
 static __thread rng_state_x2_t rng_state_x2;
+typedef xoroshiro256plus::avx2::state rng_state_x4_t;
+static __thread rng_state_x4_t rng_state_x4;
 #elif defined(SHISHUA)
 #include "shishua.h"
 typedef prng_state rng_state_t;
@@ -107,6 +113,16 @@ rn_int64_x2 get_rand_uint64_x2() {
   return rng;
 }
 
+rn_int64_x4 get_rand_uint64_x4() {
+  rn_int64_x4 rng;
+#ifdef XOROSHIRO
+  rng = xoroshiro256plus::avx2::next(rng_state_x4);
+#else
+#error "No PRNG defined"
+#endif
+  return rng;
+}
+
 uint8_t get_rand_uint1() { return _get_rand_uint64() & 0x1; }
 
 #ifdef XOROSHIRO
diff --git a/src/libvfcinstrumentonline/rand/src/sr.hpp b/src/libvfcinstrumentonline/rand/src/sr.hpp
index 1e668852..0a1d4b09 100644
--- a/src/libvfcinstrumentonline/rand/src/sr.hpp
+++ b/src/libvfcinstrumentonline/rand/src/sr.hpp
@@ -3,6 +3,7 @@
 
 #include "debug.hpp"
 #include "eft.hpp"
+#include "rand.hpp"
 #include "utils.hpp"
 
 template <typename T> bool isnumber(T a, T b) {
@@ -110,28 +111,28 @@ template <typename T> T sr_roundx2(const T sigma, const T tau, const T z) {
   return round;
 }
 
-template <>
-scalar::floatx2_t sr_roundx2(const scalar::floatx2_t &sigma,
-                             const scalar::floatx2_t &tau,
-                             const scalar::floatx2_t &z) {
-  using T = scalar::floatx2_t;
-  if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(sigma, tau))) {
-    return {sr_round(sigma.f[0], tau.f[0], z.f[0]),
-            sr_round(sigma.f[1], tau.f[1], z.f[1])};
-  }
-  constexpr int32_t mantissa = sr::utils::IEEE754<float>::mantissa;
-  const auto sign_tau = sse4_2::cmplt(tau, sse4_2::setzero());
-  const auto sign_sigma = sse4_2::cmplt(sigma, sse4_2::setzero());
-  const scalar::int32x2_t eta;
-  const auto sign_cmp = scalar::bitwise_xor(sign_tau, sign_sigma);
-  scalar::int32x2_t eta;
-  eta.u32[0] = sse4_2::blendv(
-      sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[0])),
-      sse4_2::get_exponent(sigma.f[0]), sign_cmp.u32[0]);
-  eta.u32[1] = sse4_2::blendv(
-      sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[1])),
-      sse4_2::get_exponent(sigma.f[1]), sign_cmp.u32[1]);
-}
+// template <>
+// scalar::floatx2_t sr_roundx2(const scalar::floatx2_t &sigma,
+//                              const scalar::floatx2_t &tau,
+//                              const scalar::floatx2_t &z) {
+//   using T = scalar::floatx2_t;
+//   if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(sigma, tau))) {
+//     return {sr_round(sigma.f[0], tau.f[0], z.f[0]),
+//             sr_round(sigma.f[1], tau.f[1], z.f[1])};
+//   }
+//   constexpr int32_t mantissa = sr::utils::IEEE754<float>::mantissa;
+//   const auto sign_tau = sse4_2::cmplt(tau, sse4_2::setzero());
+//   const auto sign_sigma = sse4_2::cmplt(sigma, sse4_2::setzero());
+//   const scalar::int32x2_t eta;
+//   const auto sign_cmp = scalar::bitwise_xor(sign_tau, sign_sigma);
+//   scalar::int32x2_t eta;
+//   eta.u32[0] = sse4_2::blendv(
+//       sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[0])),
+//       sse4_2::get_exponent(sigma.f[0]), sign_cmp.u32[0]);
+//   eta.u32[1] = sse4_2::blendv(
+//       sse4_2::get_exponent(sse4_2::get_predecessor_abs(sigma.f[1])),
+//       sse4_2::get_exponent(sigma.f[1]), sign_cmp.u32[1]);
+// }
 
 template <typename T> T sr_add(T a, T b) {
   debug_start();
@@ -148,35 +149,35 @@ template <typename T> T sr_add(T a, T b) {
   return sigma + round;
 }
 
-template <typename T> T sr_addx2(T a, T b) {
-  if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(a, b))) {
-    if constexpr (std::is_same<T, scalar::floatx2_t>::value) {
-      return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
-    }
-    if constexpr (std::is_same<T, scalar::double_t>::value) {
-      return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
-    }
-    return sse4_2::add(a, b);
-  }
-  T z = get_rand_double01_x2();
-  T tau, sigma, round;
-  twosumx2(a, b, sigma, tau);
-  round = sr_roundx2(sigma, tau, z);
-  return sigma + round;
-}
+// template <typename T> T sr_addx2(T a, T b) {
+//   if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(a, b))) {
+//     if constexpr (std::is_same<T, scalar::floatx2_t>::value) {
+//       return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
+//     }
+//     if constexpr (std::is_same<T, scalar::double_t>::value) {
+//       return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
+//     }
+//     return sse4_2::add(a, b);
+//   }
+//   T z = get_rand_double01_x2();
+//   T tau, sigma, round;
+//   twosumx2(a, b, sigma, tau);
+//   round = sr_roundx2(sigma, tau, z);
+//   return sigma + round;
+// }
 
-template <>
-scalar::floatx2_t sr_addx2(scalar::floatx2_t a, scalar::floatx2_t b) {
-  using T = scalar::floatx2_t;
-  if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(a, b))) {
-    return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
-  }
-  T z = get_rand_float01_x2();
-  T tau, sigma, round;
-  twosumx2(a, b, sigma, tau);
-  round = sr_roundx2(sigma, tau, z);
-  return scalar::add(sigma, round);
-}
+// template <>
+// scalar::floatx2_t sr_addx2(scalar::floatx2_t a, scalar::floatx2_t b) {
+//   using T = scalar::floatx2_t;
+//   if (sse4_2::not0(isnumberx2<sse4_2::boolx2_t, T>(a, b))) {
+//     return {.f = {sr_add(a.f[0], b.f[0]), sr_add(a.f[1], b.f[1])}};
+//   }
+//   T z = get_rand_float01_x2();
+//   T tau, sigma, round;
+//   twosumx2(a, b, sigma, tau);
+//   round = sr_roundx2(sigma, tau, z);
+//   return scalar::add(sigma, round);
+// }
 
 template <typename T> T sr_sub(T a, T b) { return sr_add(a, -b); }
 
diff --git a/src/libvfcinstrumentonline/rand/src/sr_avx.hpp b/src/libvfcinstrumentonline/rand/src/sr_avx.hpp
new file mode 100644
index 00000000..0dddfa0d
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/src/sr_avx.hpp
@@ -0,0 +1,292 @@
+#include <cmath>
+#include <cstdint>
+#include <immintrin.h>
+#include <type_traits>
+
+#include "rand.hpp"
+#include "utils.hpp"
+
+#ifndef __AVX__
+#error "AVX is required for this file"
+#endif
+
+namespace avx {
+
+typedef __m128d double2;
+typedef __m128i short8;
+typedef __m128i int4;
+typedef __m128i long2;
+typedef __m128 float4;
+
+typedef __m256d double4;
+typedef __m256i short16;
+typedef __m256i long4;
+typedef __m256i int8;
+typedef __m256 float8;
+
+void debug_m256d(const char msg[], __m256d a) {
+  std::cout << msg << ": ";
+  for (int i = 0; i < sizeof(a) / sizeof(double); i++)
+    std::cout << a[i] << " ";
+  std::cout << std::endl;
+}
+
+namespace x2double {
+
+static inline double2 get_rand_double01() {
+  return xoroshiro256plus::sse4_2::get_rand_double01(rng_state);
+}
+
+static inline double2 get_predecessor_abs(double2 a) {
+  const __m128d phi = _mm_set1_pd(1.0 - 0x1.0p-53);
+  return _mm_mul_pd(a, phi);
+}
+
+static inline long2 get_exponent(double2 a) {
+  const double2 zero = _mm_setzero_pd();
+  const long2 exp_mask = _mm_set1_epi64x(0x7ff0000000000000ULL);
+  const long2 bias = _mm_set1_pd(1023);
+
+  double2 is_zero = _mm_cmp_pd(a, zero, _CMP_EQ_OQ);
+
+  // Extract exponent using floating-point operations
+  long2 a_bits = _mm_castpd_si128(a);
+  long2 exp_bits = _mm_and_pd(a_bits, exp_mask);
+  exp_bits = _mm_srli_epi64(exp_bits, 52);
+
+  // Subtract bias
+  exp_bits = _mm_sub_epi64(exp_bits, bias);
+
+  // Blend result with zero for zero inputs
+  return _mm_blendv_pd(exp_bits, _mm_setzero_pd(), is_zero);
+}
+
+static inline double2 isnumber(double2 a, double2 b) {
+  const long2 zero = _mm_setzero_si128();
+  const long2 naninf_mask = _mm_set1_epi64x(0x7ff0000000000000);
+
+  long2 a_int = _mm_castpd_si128(a);
+  long2 b_int = _mm_castpd_si128(b);
+  long2 a_not_zero = ~_mm_cmpeq_epi64(a_int, zero);
+  long2 b_not_zero = ~_mm_cmpeq_epi64(b_int, zero);
+  long2 a_not_naninf =
+      ~_mm_cmpeq_epi64(_mm_and_si128(a, naninf_mask), naninf_mask);
+  long2 b_not_naninf =
+      ~_mm_cmpeq_epi64(_mm_and_si128(b, naninf_mask), naninf_mask);
+
+  return _mm_and_si128(_mm_and_si128(a_not_zero, a_not_naninf),
+                       _mm_and_si128(b_not_zero, b_not_naninf))
+}
+
+static inline long2 cmpgt(long2 a, long2 b) { return _mm_cmpgt_epi64(a, b); }
+
+static inline double2 pow2(long2 n) {
+
+  const int _mantissa = sr::utils::IEEE754<double>::mantissa;
+  const int _min_exponent = sr::utils::IEEE754<double>::min_exponent;
+
+  long2 min_exp_vec = _mm_set1_epi64x(_min_exponent);
+  long2 one = _mm_set1_epi64x(1);
+  long2 mantissa_vec = _mm_set1_epi64x(_mantissa);
+
+  long2 is_subnormal = cmpgt(min_exp_vec, n);
+  long2 precision_loss = _mm_sub_epi64(min_exp_vec, n - 1);
+  precision_loss = _mm_and_si128(precision_loss, is_subnormal);
+
+  long2 n_adjusted = _mm_blendv_epi8(n, one, is_subnormal);
+  double2 res = _mm_blendv_pd(_mm_set1_pd(1.0), _mm_set1_pd(0.0),
+                              _mm_castsi128_pd(is_subnormal));
+
+  long2 i = _mm_castpd_si128(res);
+  long2 shift = _mm_sub_epi64(mantissa_vec, precision_loss);
+  long2 to_add = _mm_sllv_epi64(n_adjusted, shift);
+  i = _mm_add_epi64(i, to_add);
+  return _mm_castsi128_pd(i);
+}
+
+static inline double2 abs(double2 a) {
+  const double2 sign_mask = _mm_set1_pd(-0.0);
+  return _mm_andnot_pd(sign_mask, a);
+}
+
+static inline double2 sr_round(double2 sigma, double2 tau, double2 z) {
+  const int _mantissa = sr::utils::IEEE754<double>::mantissa;
+  const int _min_exponent = sr::utils::IEEE754<double>::min_exponent;
+
+  const double2 zero = _mm_setzero_pd();
+  const long2 mantissa = _mm_set1_epi64x(_mantissa);
+
+  double2 tau_is_zero = _mm_cmp_pd(tau, zero, _CMP_EQ_OQ);
+  double2 sign_tau = _mm_cmp_pd(tau, zero, _CMP_LT_OQ);
+  double2 sign_sigma = _mm_cmp_pd(sigma, zero, _CMP_LT_OQ);
+  double2 sign_diff = _mm_xor_pd(sign_tau, sign_sigma);
+
+  double2 pred_sigma = get_predecessor_abs(sigma);
+  long2 eta = get_exponent(sigma);
+  long2 pred_eta = get_exponent(pred_sigma);
+  eta = _mm_blendv_pd(eta, pred_eta, sign_diff);
+
+  double2 ulp = pow2(_mm_sub_epi64(eta, mantissa));
+  ulp = _mm_blendv_pd(ulp, _mm_sub_pd(zero, ulp), sign_tau);
+
+  double2 pi = _mm_mul_pd(ulp, z);
+  double2 abs_tau_plus_pi = abs(_mm_add_pd(tau, pi));
+  double2 abs_ulp = abs(ulp);
+  double2 round = _mm_blendv_pd(
+      zero, ulp, _mm_cmp_pd(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ));
+
+  return _mm_blendv_pd(round, sigma, tau_is_zero);
+}
+
+static inline double2 sr_add(double2 a, double2 b) {
+  double2 is_number = isnumber(a, b);
+  double2 normal_sum = _mm_add_pd(a, b);
+
+  double2 z = get_rand_double01();
+  double2 sigma, tau;
+
+  double2 tmp = a;
+  a = _mm_min_pd(a, b);
+  b = _mm_max_pd(tmp, b);
+
+  sigma = normal_sum;
+  double2 v = _mm_sub_pd(sigma, a);
+  tau = _mm_sub_pd(b, v);
+
+  double2 round = sr_round(sigma, tau, z);
+  double2 stochastic_sum = _mm_add_pd(sigma, round);
+
+  return _mm_blendv_pd(normal_sum, stochastic_sum, is_number);
+}
+
+} // namespace x2double
+
+// Helper function to get random doubles between 0 and 1
+__m256d get_rand_double01_avx() {
+  // Implementation depends on your random number generator
+  // This is a placeholder
+  return xoroshiro256plus::avx::get_rand_double01(rng_state_x4);
+}
+
+__m256d get_predecessor_abs_avx(__m256d a) {
+  const __m256d phi = _mm256_set1_pd(1.0 - 0x1.0p-53);
+  return _mm256_mul_pd(a, phi);
+}
+
+__m256d get_exponent_avx(__m256d a) {
+  const __m256d zero = _mm256_setzero_pd();
+  const __m256d exp_mask = _mm256_set1_epi64x(0x7ff0000000000000ULL);
+  const __m256d bias = _mm256_set1_pd(1023.0);
+
+  __m256d is_zero = _mm256_cmp_pd(a, zero, _CMP_EQ_OQ);
+
+  // Extract exponent using floating-point operations
+  __m256d exp_bits = _mm256_and_pd(a, exp_mask);
+
+  // Use multiplication to effectively right-shift by 52 bits
+  const __m256d shift_const = _mm256_set1_pd(1.0 / (1ULL << 52));
+  __m256d exp = _mm256_mul_pd(exp_bits, shift_const);
+
+  // Subtract bias
+  exp = _mm256_sub_pd(exp, bias);
+
+  // Blend result with zero for zero inputs
+  return _mm256_blendv_pd(exp, zero, is_zero);
+}
+
+__m256d isnumber_avx(__m256d a, __m256d b) {
+  const __m256d zero = _mm256_setzero_pd();
+  const __m256d naninf_mask = _mm256_set1_pd(0x7ff0000000000000);
+
+  __m256d a_not_zero = _mm256_cmp_pd(a, zero, _CMP_NEQ_UQ);
+  __m256d b_not_zero = _mm256_cmp_pd(b, zero, _CMP_NEQ_UQ);
+  __m256d a_not_naninf =
+      _mm256_cmp_pd(_mm256_and_pd(a, naninf_mask), naninf_mask, _CMP_NEQ_UQ);
+  __m256d b_not_naninf =
+      _mm256_cmp_pd(_mm256_and_pd(b, naninf_mask), naninf_mask, _CMP_NEQ_UQ);
+
+  return _mm256_and_pd(_mm256_and_pd(a_not_zero, a_not_naninf),
+                       _mm256_and_pd(b_not_zero, b_not_naninf));
+}
+
+__m256i cmpgt(__m256d a, __m256d b) {
+  return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_GT_OQ));
+}
+
+__m256d pow2_avx(__m256i n) {
+  const int mantissa = sr::utils::IEEE754<double>::mantissa;
+  const int min_exponent = sr::utils::IEEE754<double>::min_exponent;
+
+  __m256i min_exp_vec = _mm256_set1_epi64x(min_exponent);
+  __m256i one = _mm256_set1_epi64x(1);
+  __m256i mantissa_vec = _mm256_set1_epi64x(mantissa);
+
+  __m256i is_subnormal = cmpgt(min_exp_vec, n);
+  __m256i precision_loss = min_exp_vec - n;
+  precision_loss = precision_loss - one;
+  precision_loss = precision_loss & is_subnormal;
+
+  __m256i n_adjusted = _mm256_blendv_pd(n, one, is_subnormal);
+
+  __m256d res = _mm256_blendv_pd(_mm256_set1_pd(1.0), _mm256_set1_pd(0.0),
+                                 _mm256_castsi256_pd(is_subnormal));
+
+  __m256i i = _mm256_castpd_si256(res);
+  __m256i shift = mantissa_vec - precision_loss;
+  __m256i to_add = n_adjusted << shift;
+  i += to_add;
+
+  return _mm256_castsi256_pd(i);
+}
+
+__m256d abs_pd(__m256d a) {
+  const __m256d sign_mask = _mm256_set1_pd(-0.0);
+  return _mm256_andnot_pd(sign_mask, a);
+}
+
+__m256d sr_round_avx(__m256d sigma, __m256d tau, __m256d z) {
+  const __m256d zero = _mm256_setzero_pd();
+  const __m256d one = _mm256_set1_pd(1.0);
+  const __m256d mantissa = _mm256_set1_pd(52);
+
+  __m256d tau_is_zero = _mm256_cmp_pd(tau, zero, _CMP_EQ_OQ);
+  __m256d sign_tau = _mm256_cmp_pd(tau, zero, _CMP_LT_OQ);
+  __m256d sign_sigma = _mm256_cmp_pd(sigma, zero, _CMP_LT_OQ);
+  __m256d sign_diff = _mm256_xor_pd(sign_tau, sign_sigma);
+
+  __m256d pred_sigma = get_predecessor_abs_avx(sigma);
+  __m256d eta = _mm256_blendv_pd(get_exponent_avx(sigma),
+                                 get_exponent_avx(pred_sigma), sign_diff);
+
+  __m256d ulp = pow2_avx(_mm256_sub_pd(eta, mantissa));
+  ulp = _mm256_blendv_pd(ulp, _mm256_sub_pd(zero, ulp), sign_tau);
+
+  __m256d pi = _mm256_mul_pd(ulp, z);
+  __m256d abs_tau_plus_pi = abs_pd(_mm256_add_pd(tau, pi));
+  __m256d abs_ulp = abs_pd(ulp);
+  __m256d round = _mm256_blendv_pd(
+      zero, ulp, _mm256_cmp_pd(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ));
+
+  return _mm256_blendv_pd(round, sigma, tau_is_zero);
+}
+
+__m256d sr_add_avx(__m256d a, __m256d b) {
+  __m256d is_number = isnumber_avx(a, b);
+  __m256d normal_sum = _mm256_add_pd(a, b);
+
+  __m256d z = get_rand_double01_avx();
+  debug_m256d("z", z);
+  __m256d sigma, tau;
+
+  // Implement twosum using AVX instructions
+  sigma = _mm256_add_pd(a, b);
+  __m256d v = _mm256_sub_pd(sigma, a);
+  tau = _mm256_sub_pd(b, v);
+
+  __m256d round = sr_round_avx(sigma, tau, z);
+  __m256d stochastic_sum = _mm256_add_pd(sigma, round);
+
+  return _mm256_blendv_pd(normal_sum, stochastic_sum, is_number);
+}
+
+} // namespace avx
diff --git a/src/libvfcinstrumentonline/rand/src/sr_avx2.hpp b/src/libvfcinstrumentonline/rand/src/sr_avx2.hpp
new file mode 100644
index 00000000..58cdb1ed
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/src/sr_avx2.hpp
@@ -0,0 +1,371 @@
+#include <cmath>
+#include <cstdint>
+#include <immintrin.h>
+#include <type_traits>
+
+#include "rand.hpp"
+#include "utils.hpp"
+
+#ifndef __AVX2__
+#error "AVX2 is required for this file"
+#endif
+
+typedef __m256 float8;
+typedef __m256i short16;
+typedef __m256i int8;
+typedef __m256i long4;
+typedef __m256d double4;
+
+static inline void debug(const char msg[]) { std::cerr << msg << std::endl; }
+
+static inline void debug_m256d(const char msg[], double4 a) {
+  std::cerr << std::hexfloat;
+  std::cerr << msg << ": ";
+  for (int i = 0; i < sizeof(a) / sizeof(double); i++)
+    std::cerr << a[i] << " ";
+  std::cerr << std::endl;
+  std::cerr << std::defaultfloat;
+}
+
+static inline void debug_m256(const char msg[], float8 a) {
+  std::cerr << std::hexfloat;
+  std::cerr << msg << ": ";
+  for (int i = 0; i < sizeof(a) / sizeof(float); i++)
+    std::cerr << a[i] << " ";
+  std::cerr << std::endl;
+  std::cerr << std::defaultfloat;
+}
+
+static inline void debug_m256i(const char msg[], __m256i a, bool hex = true) {
+  if (hex)
+    std::cerr << std::hex;
+  else
+    std::cerr << std::dec;
+  std::cerr << msg << ": ";
+  for (int i = 0; i < sizeof(a) / sizeof(double); i++)
+    std::cerr << a[i] << " ";
+  std::cerr << std::endl;
+  std::cerr << std::defaultfloat;
+}
+// Helper function to get random doubles between 0 and 1
+static inline double4 get_rand_double01_avx2() {
+  // Implementation depends on your random number generator
+  // This is a placeholder
+  return xoroshiro256plus::avx2::get_rand_double01(rng_state_x4);
+}
+
+static inline double4 get_predecessor_abs_avx2(double4 a) {
+  const double4 phi = _mm256_set1_pd(1.0 - 0x1.0p-53);
+  return _mm256_mul_pd(a, phi);
+}
+
+static inline float8 get_predecessor_abs_avx2f(float8 a) {
+  const float8 phi = _mm256_set1_ps(1.0f - 0x1.0p-24f);
+  return _mm256_mul_ps(a, phi);
+}
+
+static inline double4 get_exponent_avx2(double4 a) {
+  debug(" [get_exponent]\tstart");
+  const double4 zero = _mm256_setzero_pd();
+  const __m256i exp_mask = _mm256_set1_epi64x(0x7ff0000000000000ULL);
+  const __m256i bias = _mm256_set1_epi64x(1023);
+
+  double4 is_zero = _mm256_cmp_pd(a, zero, _CMP_EQ_OQ);
+
+  debug_m256d("\t\ta", a);
+  __m256i a_bits = _mm256_castpd_si256(a);
+  debug_m256i("\t\ta_bits", a_bits);
+  __m256i exp_bits = _mm256_and_si256(a_bits, exp_mask);
+  debug_m256i("\t\texp_bits", exp_bits);
+  exp_bits = _mm256_srli_epi64(exp_bits, 52);
+  debug_m256i("\t\texp_bits", exp_bits, false);
+
+  exp_bits = _mm256_sub_epi64(exp_bits, bias);
+  debug_m256i("\t\texp", exp_bits, false);
+
+  debug(" [get_exponent]\tend");
+  return _mm256_blendv_pd(exp_bits, zero, is_zero);
+}
+
+static inline float8 get_exponent_avx2f(float8 a) {
+  debug(" [get_exponent]\tstart");
+  const float8 zero = _mm256_setzero_ps();
+  const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
+  const __m256i bias = _mm256_set1_epi32(127);
+
+  float8 is_zero = _mm256_cmp_ps(a, zero, _CMP_EQ_OQ);
+
+  debug_m256("\t\ta", a);
+  __m256i a_bits = _mm256_castps_si256(a);
+  debug_m256i("\t\ta_bits", a_bits);
+  __m256i exp_bits = _mm256_and_si256(a_bits, exp_mask);
+  debug_m256i("\t\texp_bits", exp_bits);
+  exp_bits = _mm256_srli_epi32(exp_bits, 23);
+  debug_m256i("\t\texp_bits", exp_bits, false);
+
+  exp_bits = exp_bits - bias;
+  debug_m256i("\t\texp", exp_bits, false);
+
+  debug(" [get_exponent]\tend");
+  return _mm256_blendv_ps(exp_bits, zero, is_zero);
+}
+
+static inline double4 isnumber_avx2(double4 a, double4 b) {
+  const double4 zero = _mm256_setzero_pd();
+  const double4 naninf_mask = _mm256_set1_pd(0x7ff0000000000000);
+
+  double4 a_not_zero = _mm256_cmp_pd(a, zero, _CMP_NEQ_UQ);
+  double4 b_not_zero = _mm256_cmp_pd(b, zero, _CMP_NEQ_UQ);
+  double4 a_not_naninf =
+      _mm256_cmp_pd(_mm256_and_pd(a, naninf_mask), naninf_mask, _CMP_NEQ_UQ);
+  double4 b_not_naninf =
+      _mm256_cmp_pd(_mm256_and_pd(b, naninf_mask), naninf_mask, _CMP_NEQ_UQ);
+
+  return _mm256_and_pd(_mm256_and_pd(a_not_zero, a_not_naninf),
+                       _mm256_and_pd(b_not_zero, b_not_naninf));
+}
+
+static inline float8 isnumber_avx2f(float8 a, float8 b) {
+  const float8 zero = _mm256_setzero_ps();
+  const float8 naninf_mask = _mm256_set1_ps(0x7f800000);
+
+  float8 a_not_zero = _mm256_cmp_ps(a, zero, _CMP_NEQ_UQ);
+  float8 b_not_zero = _mm256_cmp_ps(b, zero, _CMP_NEQ_UQ);
+  float8 a_not_naninf =
+      _mm256_cmp_ps(_mm256_and_ps(a, naninf_mask), naninf_mask, _CMP_NEQ_UQ);
+  float8 b_not_naninf =
+      _mm256_cmp_ps(_mm256_and_ps(b, naninf_mask), naninf_mask, _CMP_NEQ_UQ);
+
+  return _mm256_and_ps(_mm256_and_ps(a_not_zero, a_not_naninf),
+                       _mm256_and_ps(b_not_zero, b_not_naninf));
+}
+
+// AVX2 version of cmpgt
+static inline __m256i cmpgt(__m256i a, __m256i b) {
+  return _mm256_cmpgt_epi64(a, b);
+}
+
+static inline __m256i cmpgtf(float8 a, float8 b) {
+  return _mm256_cmpgt_epi32(a, b);
+}
+
+static inline double4 pow2_avx2(__m256i n) {
+  debug(" [POW2]\tstart");
+  debug_m256i("\t\tn", n, false);
+  const int mantissa = sr::utils::IEEE754<double>::mantissa;
+  const int min_exponent = sr::utils::IEEE754<double>::min_exponent;
+
+  __m256i min_exp_vec = _mm256_set1_epi64x(min_exponent);
+  __m256i one = _mm256_set1_epi64x(1);
+  __m256i mantissa_vec = _mm256_set1_epi64x(mantissa);
+
+  __m256i is_subnormal = cmpgt(min_exp_vec, n);
+  __m256i precision_loss = _mm256_sub_epi64(min_exp_vec, n - 1);
+  precision_loss = _mm256_and_si256(precision_loss, is_subnormal);
+  debug_m256i("\t\tprecision_loss", precision_loss, false);
+
+  __m256i n_adjusted = _mm256_blendv_epi8(n, one, is_subnormal);
+
+  double4 res = _mm256_blendv_pd(_mm256_set1_pd(1.0), _mm256_set1_pd(0.0),
+                                 _mm256_castsi256_pd(is_subnormal));
+
+  __m256i i = _mm256_castpd_si256(res);
+  __m256i shift = _mm256_sub_epi64(mantissa_vec, precision_loss);
+  __m256i to_add = _mm256_sllv_epi64(n_adjusted, shift);
+  debug_m256i("\t\tto_add", to_add);
+  i = _mm256_add_epi64(i, to_add);
+
+  debug_m256d("\t\tres", _mm256_castsi256_pd(i));
+  debug(" [POW2]\tend");
+  return _mm256_castsi256_pd(i);
+}
+
+static inline float8 pow2_avx2f(__m256i n) {
+  debug(" [POW2]\tstart");
+  debug_m256i("\t\tn", n, false);
+  const int mantissa = sr::utils::IEEE754<float>::mantissa;
+  const int min_exponent = sr::utils::IEEE754<float>::min_exponent;
+
+  __m256i min_exp_vec = _mm256_set1_epi32(min_exponent);
+  __m256i one = _mm256_set1_epi32(1);
+  __m256i mantissa_vec = _mm256_set1_epi32(mantissa);
+
+  __m256i is_subnormal = cmpgtf(min_exp_vec, n);
+  __m256i precision_loss = _mm256_sub_epi32(min_exp_vec, n - 1);
+  precision_loss = _mm256_and_si256(precision_loss, is_subnormal);
+  debug_m256i("\t\tprecision_loss", precision_loss, false);
+
+  __m256i n_adjusted = _mm256_blendv_epi8(n, one, is_subnormal);
+
+  float8 res = _mm256_blendv_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(0.0),
+                                _mm256_castsi256_ps(is_subnormal));
+
+  __m256i i = _mm256_castps_si256(res);
+  __m256i shift = _mm256_sub_epi32(mantissa_vec, precision_loss);
+  __m256i to_add = _mm256_sllv_epi32(n_adjusted, shift);
+  debug_m256i("\t\tto_add", to_add);
+  i = _mm256_add_epi32(i, to_add);
+
+  debug_m256("\t\tres", _mm256_castsi256_ps(i));
+  debug(" [POW2]\tend");
+  return _mm256_castsi256_ps(i);
+}
+
+static inline double4 abs_pd(double4 a) {
+  const double4 sign_mask = _mm256_set1_pd(-0.0);
+  return _mm256_andnot_pd(sign_mask, a);
+}
+
+static inline float8 abs_ps(float8 a) {
+  const float8 sign_mask = _mm256_set1_ps(-0.0f);
+  return _mm256_andnot_ps(sign_mask, a);
+}
+
+static inline double4 sr_round_avx2(double4 sigma, double4 tau, double4 z) {
+  debug(" [SR_ROUND]\tstart");
+  debug_m256d("\tsigma ", sigma);
+  debug_m256d("\ttau   ", tau);
+
+  const int _mantissa = sr::utils::IEEE754<double>::mantissa;
+  const int _min_exponent = sr::utils::IEEE754<double>::min_exponent;
+
+  const double4 zero = _mm256_setzero_pd();
+  const __m256i mantissa = _mm256_set1_epi64x(_mantissa);
+
+  double4 tau_is_zero = _mm256_cmp_pd(tau, zero, _CMP_EQ_OQ);
+  double4 sign_tau = _mm256_cmp_pd(tau, zero, _CMP_LT_OQ);
+  double4 sign_sigma = _mm256_cmp_pd(sigma, zero, _CMP_LT_OQ);
+  double4 sign_diff = _mm256_xor_pd(sign_tau, sign_sigma);
+
+  double4 pred_sigma = get_predecessor_abs_avx2(sigma);
+  __m256i eta = _mm256_blendv_pd(get_exponent_avx2(sigma),
+                                 get_exponent_avx2(pred_sigma), sign_diff);
+
+  debug_m256i("eta-mantissa", eta - mantissa, false);
+  double4 ulp = pow2_avx2(_mm256_sub_epi64(eta, mantissa));
+  debug_m256d("\tulp   ", ulp);
+  ulp = _mm256_blendv_pd(ulp, _mm256_sub_pd(zero, ulp), sign_tau);
+
+  double4 pi = _mm256_mul_pd(ulp, z);
+  double4 abs_tau_plus_pi = abs_pd(_mm256_add_pd(tau, pi));
+  double4 abs_ulp = abs_pd(ulp);
+  double4 round = _mm256_blendv_pd(
+      zero, ulp, _mm256_cmp_pd(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ));
+
+  debug_m256d("\tz     ", z);
+  debug_m256i("\teta   ", eta, false);
+  debug_m256d("\tpi    ", pi);
+  debug_m256d("\ttau+pi", _mm256_add_pd(tau, pi));
+  debug_m256d("\tulp   ", ulp);
+  debug_m256d("\tround ", round);
+  debug_m256d("\t--", _mm256_setzero_pd());
+  debug(" [SR_ROUND]\tend");
+  return _mm256_blendv_pd(round, sigma, tau_is_zero);
+}
+
+static inline float8 sr_round_avx2f(float8 sigma, float8 tau, float8 z) {
+  debug(" [SR_ROUND]\tstart");
+  debug_m256("\tsigma ", sigma);
+  debug_m256("\ttau   ", tau);
+
+  const int _mantissa = sr::utils::IEEE754<float>::mantissa;
+  const int _min_exponent = sr::utils::IEEE754<float>::min_exponent;
+
+  const float8 zero = _mm256_setzero_ps();
+  const __m256i mantissa = _mm256_set1_epi32(_mantissa);
+
+  float8 tau_is_zero = _mm256_cmp_ps(tau, zero, _CMP_EQ_OQ);
+  float8 sign_tau = _mm256_cmp_ps(tau, zero, _CMP_LT_OQ);
+  float8 sign_sigma = _mm256_cmp_ps(sigma, zero, _CMP_LT_OQ);
+  float8 sign_diff = _mm256_xor_ps(sign_tau, sign_sigma);
+
+  float8 pred_sigma = get_predecessor_abs_avx2f(sigma);
+  __m256i eta = _mm256_blendv_ps(get_exponent_avx2f(sigma),
+                                 get_exponent_avx2f(pred_sigma), sign_diff);
+
+  debug_m256i("eta-mantissa", eta - mantissa, false);
+  float8 ulp = pow2_avx2f(eta - mantissa);
+  debug_m256("\tulp   ", ulp);
+  ulp = _mm256_blendv_ps(ulp, _mm256_sub_ps(zero, ulp), sign_tau);
+
+  float8 pi = _mm256_mul_ps(ulp, z);
+  float8 abs_tau_plus_pi = abs_ps(_mm256_add_ps(tau, pi));
+  float8 abs_ulp = abs_ps(ulp);
+  float8 round = _mm256_blendv_ps(
+      zero, ulp, _mm256_cmp_ps(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ));
+
+  debug_m256("\tz     ", z);
+  debug_m256i("\teta   ", eta, false);
+  debug_m256("\tpi    ", pi);
+  debug_m256("\ttau+pi", _mm256_add_ps(tau, pi));
+  debug_m256("\tulp   ", ulp);
+  debug_m256("\tround ", round);
+  debug_m256("\t--", _mm256_setzero_ps());
+  debug(" [SR_ROUND]\tend");
+  return _mm256_blendv_ps(round, sigma, tau_is_zero);
+}
+
+static inline double4 sr_add_avx2(double4 a, double4 b) {
+  debug(" [SR_ADD]\t\tstart");
+  double4 is_number = isnumber_avx2(a, b);
+  double4 normal_sum = _mm256_add_pd(a, b);
+
+  double4 z = get_rand_double01_avx2();
+  debug_m256d("\t\tz", z);
+  double4 sigma, tau;
+
+  // Implement twosum using AVX instructions
+  // swap a and b if |a| < |b|
+  double4 swap = _mm256_cmp_pd(abs_pd(a), abs_pd(b), _CMP_LT_OQ);
+  double4 tmp = a;
+  a = _mm256_blendv_pd(a, b, swap);
+  b = _mm256_blendv_pd(b, tmp, swap);
+
+  sigma = _mm256_add_pd(a, b);
+  double4 v = _mm256_sub_pd(sigma, a);
+  tau = _mm256_sub_pd(b, v);
+
+  double4 round = sr_round_avx2(sigma, tau, z);
+  double4 stochastic_sum = _mm256_add_pd(sigma, round);
+
+  debug_m256d("\t\tsigma", sigma);
+  debug_m256d("\t\ttau  ", tau);
+  debug_m256d("\t\tstochastic_sum", stochastic_sum);
+  debug(" [SR_ADD]\t\tend");
+
+  return _mm256_blendv_pd(normal_sum, stochastic_sum, is_number);
+}
+
+static inline double4 sr_sub_avx2(double4 a, double4 b) {
+  b = _mm256_xor_pd(b, _mm256_set1_pd(-0.0));
+  return sr_add_avx2(a, b);
+}
+
+static inline double4 sr_mul_avx2(double4 a, double4 b) {
+  debug(" [SR_MUL]\t\tstart");
+  double4 is_number = isnumber_avx2(a, b);
+  double4 normal_mul = _mm256_mul_pd(a, b);
+  double4 z = get_rand_double01_avx2();
+  double4 sigma = normal_mul;
+  double4 tau = _mm256_fmsub_pd(a, b, sigma);
+  double4 round = sr_round_avx2(sigma, tau, z);
+  double4 stochastic_mul = _mm256_add_pd(sigma, round);
+
+  debug(" [SR_MUL]\t\tend");
+  return _mm256_blendv_pd(normal_mul, stochastic_mul, is_number);
+}
+
+static inline double4 sr_div_avx2(double4 a, double4 b) {
+  debug(" [SR_DIV]\t\tstart");
+  double4 is_number = isnumber_avx2(a, b);
+  double4 normal_div = _mm256_div_pd(a, b);
+  double4 z = get_rand_double01_avx2();
+  double4 sigma = normal_div;
+  double4 tau =
+      _mm256_fmadd_pd(_mm256_xor_pd(sigma, _mm256_set1_pd(-0.0)), b, a);
+  double4 round = sr_round_avx2(sigma, tau, z);
+  double4 stochastic_div = _mm256_add_pd(sigma, round);
+
+  debug(" [SR_DIV]\t\tend");
+  return _mm256_blendv_pd(normal_div, stochastic_div, is_number);
+}
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/src/sr_sse4-2.hpp b/src/libvfcinstrumentonline/rand/src/sr_sse4-2.hpp
new file mode 100644
index 00000000..93f36cbf
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/src/sr_sse4-2.hpp
@@ -0,0 +1,186 @@
+#include <cmath>
+#include <cstdint>
+#include <immintrin.h>
+#include <type_traits>
+
+#include "rand.hpp"
+#include "utils.hpp"
+#include "xoroshiro256+.hpp"
+
+#ifndef __SSE4_2__
+#error "SSE4.2 is required for this file"
+#endif
+
+namespace sse4_2 {
+
+typedef __m128d double2;
+typedef __m128i short8;
+typedef __m128i int4;
+typedef __m128i long2;
+typedef __m128 float4;
+
+void debug_m128d(const char msg[], __m128d a) {
+  std::cout << msg << ": ";
+  for (int i = 0; i < sizeof(a) / sizeof(double); i++)
+    std::cout << a[i] << " ";
+  std::cout << std::endl;
+}
+
+void debug_m128i(const char msg[], __m128i a) {
+  std::cout << msg << ": ";
+  for (int i = 0; i < sizeof(a) / sizeof(int); i++)
+    std::cout << a[i] << " ";
+  std::cout << std::endl;
+}
+
+namespace doublex2 {
+
+static inline double2 get_rand_double01() {
+  return xoroshiro256plus::sse4_2::get_rand_double01(rng_state_x2);
+}
+
+static inline double2 get_predecessor_abs(double2 a) {
+  const double2 phi = _mm_set1_pd(1.0 - 0x1.0p-53);
+  return _mm_mul_pd(a, phi);
+}
+
+static inline double2 get_exponent(double2 a) {
+  const double2 zero = _mm_setzero_pd();
+  const long2 exp_mask = _mm_set1_epi64x(0x7ff0000000000000ULL);
+  const long2 bias = _mm_set1_pd(1023);
+
+  double2 is_zero = _mm_cmp_pd(a, zero, _CMP_EQ_OQ);
+
+  // Extract exponent using floating-point operations
+  long2 a_bits = _mm_castpd_si128(a);
+  long2 exp_bits = _mm_and_pd(a_bits, exp_mask);
+  exp_bits = _mm_srli_epi64(exp_bits, 52);
+
+  // Subtract bias
+  exp_bits = _mm_sub_pd(exp_bits, bias);
+
+  // Blend result with zero for zero inputs
+  return _mm_blendv_pd(exp_bits, zero, is_zero);
+}
+
+static inline double2 isnumber(double2 a, double2 b) {
+  const double2 zero = _mm_setzero_pd();
+  const double2 naninf_mask = _mm_set1_pd(0x7ff0000000000000);
+
+  double2 a_not_zero = _mm_cmp_pd(a, zero, _CMP_NEQ_UQ);
+  double2 b_not_zero = _mm_cmp_pd(b, zero, _CMP_NEQ_UQ);
+  double2 a_not_naninf =
+      _mm_cmp_pd(_mm_and_pd(a, naninf_mask), naninf_mask, _CMP_NEQ_UQ);
+  double2 b_not_naninf =
+      _mm_cmp_pd(_mm_and_pd(b, naninf_mask), naninf_mask, _CMP_NEQ_UQ);
+
+  return _mm_and_pd(_mm_and_pd(a_not_zero, a_not_naninf),
+                    _mm_and_pd(b_not_zero, b_not_naninf));
+}
+
+static inline int4 cmpgt(double2 a, double2 b) {
+  return _mm_castpd_si128(_mm_cmp_pd(a, b, _CMP_GT_OQ));
+}
+
+static inline double2 pow2(long2 n) {
+  const int mantissa = sr::utils::IEEE754<double>::mantissa;
+  const int min_exponent = sr::utils::IEEE754<double>::min_exponent;
+
+  const double2 zero = _mm_setzero_pd();
+  const double2 one = _mm_set1_pd(1.0);
+
+  long2 min_exp_vec = _mm_set1_epi64x(min_exponent);
+  long2 one = _mm_set1_epi64x(1);
+  long2 mantissa_vec = _mm_set1_epi64x(mantissa);
+
+  long2 is_subnormal = _mm_cmpeq_epi64(min_exp_vec, n);
+  long2 precision_loss = _mm_sub_epi64(min_exp_vec, n - 1);
+  precision_loss = _mm_and_si128(precision_loss, is_subnormal);
+
+  long2 n_adjusted = _mm_blendv_epi8(n, one, is_subnormal);
+  double2 res = _mm_blendv_pd(one, zero, _mm_castsi128_pd(is_subnormal));
+
+  long2 i = _mm_castpd_si128(res);
+  long2 shift = _mm_sub_epi64(mantissa_vec, precision_loss);
+  long2 to_add = _mm_sllv_epi64(n_adjusted, shift);
+  i = _mm_add_epi64(i, to_add);
+  return _mm_castsi128_pd(i);
+}
+
+static inline double2 abs(double2 a) {
+  const long2 sign_mask = _mm_set1_epi64x(0x8000000000000000ULL);
+  return _mm_andnot_pd(_mm_castsi128_pd(sign_mask), a);
+}
+
+static inline double2 sr_round(double2 sigma, double2 tau, double2 z) {
+  const int _mantissa = sr::utils::IEEE754<double>::mantissa;
+  const int _min_exponent = sr::utils::IEEE754<double>::min_exponent;
+
+  const double2 zero = _mm_setzero_pd();
+  const long2 mantissa = _mm_set1_epi64x(_mantissa);
+
+  double2 tau_is_zero = _mm_cmp_pd(tau, zero, _CMP_EQ_OQ);
+  double2 sign_tau = _mm_and_pd(tau, _mm_set1_pd(-0.0));
+  double2 sign_sigma = _mm_and_pd(sigma, _mm_set1_pd(-0.0));
+  double2 sign_diff = _mm_xor_pd(sign_tau, sign_sigma);
+
+  double2 pred_sigma = get_predecessor_abs(sigma);
+  long2 eta =
+      _mm_blendv_pd(get_exponent(sigma), get_exponent(pred_sigma), sign_diff);
+
+  double2 ulp = pow2(_mm_sub_epi64(eta, mantissa));
+  ulp = _mm_blendv_pd(ulp, -ulp, tau_is_zero);
+  double2 pi = ulp * z;
+  double2 abs_ulp = abs(ulp);
+  double2 abs_tau_plus_pi = abs(tau + pi);
+  double2 round = _mm_blendv_pd(
+      zero, sign_diff, _mm_cmp_pd(abs_tau_plus_pi, abs_ulp, _CMP_GE_OQ));
+  return _mm_blendv_pd(round, sigma, tau_is_zero);
+}
+
+static inline double2 sr_add(double2 a, double2 b) {
+  double2 is_number = isnumber(a, b);
+  double2 normal_sum = a + b;
+  double2 z = get_rand_double01();
+  double2 tau, sigma, round;
+
+  double2 tmp = a;
+  a = _mm_min_pd(a, b);
+  b = _mm_max_pd(tmp, b);
+
+  sigma = normal_sum;
+  double2 v = sigma - a;
+  tau = b - v;
+
+  round = sr_round(sigma, tau, z);
+  return sigma + round;
+}
+
+static inline double2 sr_sub(double2 a, double2 b) { return sr_add(a, -b); }
+
+static inline double2 sr_mul(double2 a, double2 b) {
+  double2 is_number = isnumber(a, b);
+  double2 normal_sum = a + b;
+  double2 z = get_rand_double01();
+  double2 tau, sigma, round;
+
+  double2 z = get_rand_double01();
+  double2 tau, sigma, round;
+  twoproduct(a, b, sigma, tau);
+  round = sr_round(sigma, tau, z);
+  return sigma + round;
+}
+
+static inline double2 sr_div(double2 a, double2 b) {
+  if (not isnumber(a, b)) {
+    return a / b;
+  }
+  double2 z = get_rand_double01();
+  double2 tau, sigma, round;
+  twodiv(a, b, sigma, tau);
+  round = sr_round(sigma, tau, z);
+  return sigma + round;
+}
+
+} // namespace doublex2
+} // namespace sse4_2
diff --git a/src/libvfcinstrumentonline/rand/src/utils.hpp b/src/libvfcinstrumentonline/rand/src/utils.hpp
index c13019a3..a9c89b09 100644
--- a/src/libvfcinstrumentonline/rand/src/utils.hpp
+++ b/src/libvfcinstrumentonline/rand/src/utils.hpp
@@ -3,6 +3,7 @@
 
 #include <cstdint>
 #include <immintrin.h>
+#include <type_traits>
 
 #include "vector_types.hpp"
 
@@ -194,7 +195,7 @@ template <typename T> T pow2(int n) {
   return res;
 }
 
-// TODO: finish implementing this function
+// TODO: finish to implement this function
 template <typename T> T add_round_odd(T a, T b) {
   // return addition with rounding to odd
   // https://www.lri.fr/~melquion/doc/08-tc.pdf
diff --git a/src/libvfcinstrumentonline/rand/src/vector_types.hpp b/src/libvfcinstrumentonline/rand/src/vector_types.hpp
index 2f1a188a..3e31cff6 100644
--- a/src/libvfcinstrumentonline/rand/src/vector_types.hpp
+++ b/src/libvfcinstrumentonline/rand/src/vector_types.hpp
@@ -13,6 +13,7 @@ typedef union {
   uint32_t u32[2];
   int32_t i32[2];
 } double_t;
+
 typedef double_t floatx2_t;
 typedef double_t uint64_t;
 typedef double_t int64_t;
@@ -20,8 +21,9 @@ typedef double_t uint32x2_t;
 typedef double_t int32x2_t;
 typedef double_t boolx2_t;
 
-floatx2_t add(floatx2_t a,
-              floatx2_t b){return {.f = {a.f[0] + b.f[0], a.f[1] + b.f[1]}}};
+floatx2_t add(floatx2_t a, floatx2_t b) {
+  return {.f = {a.f[0] + b.f[0], a.f[1] + b.f[1]}};
+}
 boolx2_t cmplt(floatx2_t a, floatx2_t b) {
   return {.u32 = {a.f[0] < b.f[0], a.f[1] < b.f[1]}};
 }
@@ -34,6 +36,7 @@ boolx2_t bitwise_xor(boolx2_t a, boolx2_t b) {
 
 floatx2_t mul(floatx2_t a, floatx2_t b) {
   return {.f = {a.f[0] * b.f[0], a.f[1] * b.f[1]}};
+}
 
 } // namespace scalar
 
@@ -80,19 +83,23 @@ doublex2_t add(doublex2_t a, doublex2_t b) { return _mm_add_pd(a, b); };
 } // namespace sse4_2
 
 namespace avx {
-typedef __m128i uint64x2_t;
-typedef __m128i uint32x4_t;
-typedef __m128 floatx4_t;
-typedef __m128d doublex2_t;
+typedef __m256i int64x4_t;
+typedef __m256i int32x8_t;
+typedef __m256i uint64x4_t;
+typedef __m256i uint32x8_t;
+typedef __m256 floatx8_t;
+typedef __m256d doublex4_t;
 
 /* The current state of the generators. */
 typedef struct {
-  __m128i s[4];
+  __m256i s[4];
 } state;
 
 } // namespace avx
 
 namespace avx2 {
+typedef __m256i int64x4_t;
+typedef __m256i int32x8_t;
 typedef __m256i uint64x4_t;
 typedef __m256i uint32x8_t;
 typedef __m256 floatx8_t;
diff --git a/src/libvfcinstrumentonline/rand/src/xoroshiro256+.hpp b/src/libvfcinstrumentonline/rand/src/xoroshiro256+.hpp
new file mode 100644
index 00000000..a145e67b
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/src/xoroshiro256+.hpp
@@ -0,0 +1,406 @@
+#ifndef __VERIFICARLO_SRLIB_RAND_XOROSHIRO256P_HPP__
+#define __VERIFICARLO_SRLIB_RAND_XOROSHIRO256P_HPP__
+
+#include <stdio.h>
+
+#include <array>
+#include <immintrin.h>
+#include <stdint.h>
+
+#include "vector_types.hpp"
+
+namespace xoroshiro256plus {
+
+namespace scalar {
+typedef uint64_t rn_uint64;
+
+/* The current state of the generators. */
+typedef struct {
+  uint64_t s[4];
+} state;
+
+static inline uint64_t rotl(uint64_t x, int k) {
+  return (x << k) | (x >> (64 - k));
+}
+
+static inline uint64_t next(state &s) {
+  uint64_t result = s.s[0] + s.s[3];
+  uint64_t t = s.s[1] << 17;
+
+  s.s[2] ^= s.s[0];
+  s.s[3] ^= s.s[1];
+  s.s[1] ^= s.s[2];
+  s.s[0] ^= s.s[3];
+  s.s[2] ^= t;
+  s.s[3] = rotl(s.s[3], 45);
+
+  return result;
+}
+
+void init(state &s, const std::array<uint64_t, 1> &seeds) {
+  s.s[0] = seeds[0];
+  s.s[1] = 0;
+  s.s[2] = 0;
+  s.s[3] = 0;
+}
+
+} // namespace scalar
+
+namespace sse4_2 {
+/* The current state of the generators. */
+typedef struct {
+  __m128i s[4];
+} state;
+
+__attribute__((target("sse4.2"))) static __inline __m128i rotl(__m128i x,
+                                                               int k) {
+  return _mm_or_si128(_mm_slli_epi64(x, k), _mm_srli_epi64(x, 64 - k));
+}
+
+__attribute__((target("sse4.2"))) static inline __m128i next(state &s) {
+  // result[i] = s[0][i] + s[3][i];
+  __m128i result = _mm_add_epi64(s.s[0], s.s[3]);
+
+  // t[i] = s[1][i] << 17;
+  __m128i t = _mm_slli_epi64(s.s[1], 17);
+
+  // s[2][i] ^= s[0][i];
+  s.s[2] = _mm_xor_si128(s.s[2], s.s[0]);
+
+  // s[3][i] ^= s[1][i];
+  s.s[3] = _mm_xor_si128(s.s[3], s.s[1]);
+
+  // s[1][i] ^= s[2][i];
+  s.s[1] = _mm_xor_si128(s.s[1], s.s[2]);
+
+  // s[0][i] ^= s[3][i];
+  s.s[0] = _mm_xor_si128(s.s[0], s.s[3]);
+
+  // s[2][i] ^= t[i];
+  s.s[2] = _mm_xor_si128(s.s[2], t);
+
+  // s[3][i] = rotl(s[3][i], 45);
+  s.s[3] = rotl(s.s[3], 45);
+
+  return result;
+}
+
+__attribute__((target("sse4.2"))) static inline __m128d
+get_rand_double01(state &s) {
+  __m128i rng = next(s);
+  __m128i shift = _mm_srli_epi64(rng, 11);
+  __m128d shiftd = _mm_cvtepi64_pd(shift) * 0x1.0p-53;
+  return shiftd;
+}
+
+__attribute__((target("sse4.2"))) void
+init(state &s, const std::array<uint64_t, 2> &seeds) {
+  s.s[0] = _mm_set_epi64x(seeds[0], seeds[1]);
+  s.s[1] = _mm_setzero_si128();
+  s.s[2] = _mm_setzero_si128();
+  s.s[3] = _mm_setzero_si128();
+}
+
+} // namespace sse4_2
+
+namespace avx {
+
+void debug_m256i(const char msg[], __m256i a = _mm256_setzero_si256(),
+                 bool novalue = false) {
+  printf("%s: ", msg);
+  if (novalue) {
+    printf("\n");
+    return;
+  }
+  for (int i = 0; i < 4; i++) {
+    printf("%016lx ", (uint64_t)a[i]);
+  }
+  printf("\n");
+}
+
+void debug_m256d(const char msg[], __m256d a = _mm256_setzero_pd(),
+                 bool novalue = false) {
+  printf("%s: ", msg);
+  if (novalue) {
+    printf("\n");
+    return;
+  }
+  for (int i = 0; i < 4; i++) {
+    printf("%+.13a ", a[i]);
+  }
+  printf("\n");
+}
+
+void debug(const char msg[]) { debug_m256i(msg, _mm256_setzero_si256(), true); }
+
+/* The current state of the generators. */
+typedef struct {
+  __m256i s[4];
+} state;
+
+__attribute__((target("avx"))) static __inline __m256i rotl(__m256i x, int k) {
+  return x << k | (x >> (64 - k));
+}
+
+__attribute__((target("avx"))) static inline __m256i next(state &s) {
+  debug_m256i("state[0]", s.s[0]);
+  debug_m256i("state[1]", s.s[1]);
+  debug_m256i("state[2]", s.s[2]);
+  debug_m256i("state[3]", s.s[3]);
+  debug("----\n");
+  // result[i] = s[0][i] + s[3][i];
+  __m256i result = s.s[0] + s.s[3];
+  debug("result[i] = s[0][i] + s[3][i];");
+  debug_m256i("result  ", result);
+
+  // t[i] = s[1][i] << 17;
+  __m256i t = s.s[1] << 17;
+  debug("t[i] = s[1][i] << 17;");
+  debug_m256i("t       ", t);
+
+  // s[2][i] ^= s[0][i];
+  s.s[2] = s.s[2] ^ s.s[0];
+  debug("s[2][i] ^= s[0][i];");
+  debug_m256i("s[2]    ", s.s[2]);
+
+  // s[3][i] ^= s[1][i];
+  s.s[3] = s.s[3] ^ s.s[1];
+  debug("s[3][i] ^= s[1][i];");
+  debug_m256i("s[3]    ", s.s[3]);
+
+  // s[1][i] ^= s[2][i];
+  s.s[1] = s.s[1] ^ s.s[2];
+  debug("s[1][i] ^= s[2][i];");
+  debug_m256i("s[1]    ", s.s[1]);
+
+  // s[0][i] ^= s[3][i];
+  s.s[0] = s.s[0] ^ s.s[3];
+  debug("s[0][i] ^= s[3][i];");
+  debug_m256i("s[0]    ", s.s[0]);
+
+  // s[2][i] ^= t[i];
+  s.s[2] = s.s[2] ^ t;
+  debug("s[2][i] ^= t[i];");
+  debug_m256i("s[2]    ", s.s[2]);
+
+  // s[3][i] = rotl(s[3][i], 45);
+  s.s[3] = rotl(s.s[3], 45);
+  debug("s[3][i] = rotl(s[3][i], 45);");
+  debug_m256i("s[3]    ", s.s[3]);
+
+  return result;
+}
+
+__attribute__((target("avx"))) static inline __m256d
+get_rand_double01(state &s) {
+  __m256i rng = next(s);
+  __m128i rng_lo = _mm256_extractf128_si256(rng, 0);
+  __m128i rng_hi = _mm256_extractf128_si256(rng, 1);
+  debug_m256i("next:", rng);
+  debug("----\n");
+  __m128i shift_lo = _mm_srli_epi64(rng_lo, 11);
+  __m128i shift_hi = _mm_srli_epi64(rng_hi, 11);
+  debug_m256i("shift", _mm256_set_m128i(shift_hi, shift_lo));
+  debug_m256d("shift", _mm256_set_m128d(_mm_cvtepi64_pd(shift_hi),
+                                        _mm_cvtepi64_pd(shift_lo)));
+  __m128d shiftd_lo = _mm_cvtepi64_pd(shift_lo) * 0x1.0p-53;
+  __m128d shiftd_hi = _mm_cvtepi64_pd(shift_hi) * 0x1.0p-53;
+  __m256d shiftd = _mm256_set_m128d(shiftd_hi, shiftd_lo);
+  debug_m256d("shiftd", shiftd);
+  return shiftd;
+}
+
+__attribute__((target("avx"))) void init(state &s,
+                                         const std::array<uint64_t, 4> &seeds) {
+  s.s[0] = _mm256_set_epi64x(seeds[0], seeds[1], seeds[2], seeds[3]);
+  s.s[1] = _mm256_setzero_si256();
+  s.s[2] = _mm256_setzero_si256();
+  s.s[3] = _mm256_setzero_si256();
+}
+
+} // namespace avx
+
+namespace avx2 {
+
+#define RET_DEBUG
+#ifdef RET_DEGUG
+#define RET return;
+#else
+#define RET
+#endif
+
+void debug_m128i(const char msg[], __m128i a = _mm_setzero_si128(),
+                 bool novalue = false) {
+  return;
+  printf("%s: ", msg);
+  if (novalue) {
+    printf("\n");
+    return;
+  }
+  for (int i = 0; i < 2; i++) {
+    printf("%016lx ", (uint64_t)a[i]);
+  }
+  printf("\n");
+}
+
+void debug_m256i(const char msg[], __m256i a = _mm256_setzero_si256(),
+                 bool novalue = false) {
+  return;
+  printf("%s: ", msg);
+  if (novalue) {
+    printf("\n");
+    return;
+  }
+  for (int i = 0; i < 4; i++) {
+    printf("%016lx ", (uint64_t)a[i]);
+  }
+  printf("\n");
+}
+
+void debug_m256d(const char msg[], __m256d a = _mm256_setzero_pd(),
+                 bool novalue = false) {
+  return;
+  printf("%s: ", msg);
+  if (novalue) {
+    printf("\n");
+    return;
+  }
+  for (int i = 0; i < 4; i++) {
+    printf("%+.13a ", a[i]);
+  }
+  printf("\n");
+}
+
+void debug_m128d(const char msg[], __m128d a = _mm_setzero_pd(),
+                 bool novalue = false) {
+  return;
+  printf("%s: ", msg);
+  if (novalue) {
+    printf("\n");
+    return;
+  }
+  for (int i = 0; i < 2; i++) {
+    printf("%+.13a ", a[i]);
+  }
+  printf("\n");
+}
+
+void debug(const char msg[]) { debug_m256i(msg, _mm256_setzero_si256(), true); }
+
+/* The current state of the generators. */
+typedef struct {
+  __m256i s[4];
+} state;
+
+__attribute__((target("avx2"))) static __inline __m256i rotl(__m256i x, int k) {
+  return _mm256_or_si256(_mm256_slli_epi64(x, k), _mm256_srli_epi64(x, 64 - k));
+}
+
+__attribute__((target("avx2"))) static inline __m256i next(state &s) {
+  // result[i] = s[0][i] + s[3][i];
+  __m256i result = _mm256_add_epi64(s.s[0], s.s[3]);
+
+  // t[i] = s[1][i] << 17;
+  __m256i t = _mm256_slli_epi64(s.s[1], 17);
+
+  // s[2][i] ^= s[0][i];
+  s.s[2] = _mm256_xor_si256(s.s[2], s.s[0]);
+
+  // s[3][i] ^= s[1][i];
+  s.s[3] = _mm256_xor_si256(s.s[3], s.s[1]);
+
+  // s[1][i] ^= s[2][i];
+  s.s[1] = _mm256_xor_si256(s.s[1], s.s[2]);
+
+  // s[0][i] ^= s[3][i];
+  s.s[0] = _mm256_xor_si256(s.s[0], s.s[3]);
+
+  // s[2][i] ^= t[i];
+  s.s[2] = _mm256_xor_si256(s.s[2], t);
+
+  // s[3][i] = rotl(s[3][i], 45);
+  s.s[3] = rotl(s.s[3], 45);
+
+  return result;
+}
+
+__m256d get_rand_double01(state &s) {
+  __m256i rng = next(s);
+  debug_m256i("rng", rng);
+  __m256i shift = _mm256_srli_epi64(rng, 11);
+  __m128i shift_lo = _mm256_extractf128_si256(shift, 0);
+  __m128i shift_hi = _mm256_extractf128_si256(shift, 1);
+  // convert to double
+  debug_m128i("shift lo", shift_lo);
+  debug_m128i("shift hi", shift_hi);
+  debug_m128d("shiftd lo", _mm_castsi128_pd(shift_lo));
+  debug_m128d("shiftd hi", _mm_castsi128_pd(shift_hi));
+  __m256d x = _mm256_set_pd(shift_hi[1], shift_hi[0], shift_lo[1], shift_lo[0]);
+  debug_m256d("x", x * 0x1.0p-53);
+  return x * 0x1.0p-53;
+}
+
+__attribute__((target("avx2"))) void
+init(state &s, const std::array<uint64_t, 4> &seeds) {
+  s.s[0] = _mm256_set_epi64x(seeds[0], seeds[1], seeds[2], seeds[3]);
+  s.s[1] = _mm256_setzero_si256();
+  s.s[2] = _mm256_setzero_si256();
+  s.s[3] = _mm256_setzero_si256();
+}
+
+} // namespace avx2
+
+namespace avx512 {
+typedef __m512i rn_uint64_x8;
+
+/* The current state of the generators. */
+typedef struct {
+  __m512i s[4];
+} state;
+
+__attribute__((target("avx512f"))) static __inline __m512i rotl(__m512i x,
+                                                                int k) {
+  return _mm512_or_si512(_mm512_slli_epi64(x, k), _mm512_srli_epi64(x, 64 - k));
+}
+
+__attribute__((target("avx512f"))) static inline __m512i next(state &s) {
+  // result[i] = s[0][i] + s[3][i];
+  __m512i result = _mm512_add_epi64(s.s[0], s.s[3]);
+
+  // t[i] = s[1][i] << 17;
+  __m512i t = _mm512_slli_epi64(s.s[1], 17);
+
+  // s[2][i] ^= s[0][i];
+  s.s[2] = _mm512_xor_si512(s.s[2], s.s[0]);
+
+  // s[3][i] ^= s[1][i];
+  s.s[3] = _mm512_xor_si512(s.s[3], s.s[1]);
+
+  // s[1][i] ^= s[2][i];
+  s.s[1] = _mm512_xor_si512(s.s[1], s.s[2]);
+
+  // s[0][i] ^= s[3][i];
+  s.s[0] = _mm512_xor_si512(s.s[0], s.s[3]);
+
+  // s[2][i] ^= t[i];
+  s.s[2] = _mm512_xor_si512(s.s[2], t);
+
+  // s[3][i] = rotl(s[3][i], 45);
+  s.s[3] = rotl(s.s[3], 45);
+
+  return result;
+}
+
+__attribute__((target("avx512f"))) void
+init(state &s, const std::array<uint64_t, 8> &seeds) {
+  s.s[0] = _mm512_set_epi64(seeds[0], seeds[1], seeds[2], seeds[3], seeds[4],
+                            seeds[5], seeds[6], seeds[7]);
+  s.s[1] = _mm512_setzero_si512();
+  s.s[2] = _mm512_setzero_si512();
+  s.s[3] = _mm512_setzero_si512();
+}
+
+} // namespace avx512
+
+} // namespace xoroshiro256plus
+#endif // __VERIFICARLO_SRLIB_RAND_XOROSHIRO256P_HPP__
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/test.cpp b/src/libvfcinstrumentonline/rand/test.cpp
index 179ede24..7e22ad0c 100644
--- a/src/libvfcinstrumentonline/rand/test.cpp
+++ b/src/libvfcinstrumentonline/rand/test.cpp
@@ -21,7 +21,7 @@ template <typename T> T apply_op(const char op, T a, T b) {
     return a + b;
   case '-':
     return a - b;
-  case '*':
+  case 'x':
     return a * b;
   case '/':
     return a / b;
diff --git a/src/libvfcinstrumentonline/rand/test.sh b/src/libvfcinstrumentonline/rand/test.sh
index 4dfaa536..95909a8d 100755
--- a/src/libvfcinstrumentonline/rand/test.sh
+++ b/src/libvfcinstrumentonline/rand/test.sh
@@ -5,8 +5,8 @@ if [ $# -ne 3 ]; then
     exit 1
 fi
 
-verificarlo-c++ -DREAL=$REAL --function=apply_op test.cpp -std=c++17 -Wall -o test
+verificarlo-c++ -DREAL=$REAL --function=_Z8apply_opIdET_cS0_S0_ test.cpp -std=c++17 -Wall -o test
 
-export VFC_BACKENDS="libinterflop_mca.so --precision-binary64=53 --precision-binary32=24"
+export VFC_BACKENDS="libinterflop_mca.so --mode=rr --precision-binary64=53 --precision-binary32=24"
 export VFC_BACKENDS_LOGGER=False
 ./test $1 $2 $3
diff --git a/tests/test_online_instrumentation_sr/test.c b/tests/test_online_instrumentation_sr/test.c
deleted file mode 100644
index 9bc9a110..00000000
--- a/tests/test_online_instrumentation_sr/test.c
+++ /dev/null
@@ -1,144 +0,0 @@
-#include "interflop/common/float_const.h"
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#ifndef REAL_TYPE
-#warning "REAL is not defined, defaulting to double"
-#define REAL double
-#define REAL_TYPE 1
-#endif
-
-#if REAL_TYPE == 1
-#define DOUBLE
-#define REAL double
-#elif REAL_TYPE == 2
-#define FLOAT
-#define REAL float
-#endif
-
-#ifndef REAL_TYPE
-#error "REAL_TYPE is not defined"
-#endif
-
-
-double add_double(double a, double b) { return a + b; }
-
-double sub_double(double a, double b) { return a - b; }
-
-double mul_double(double a, double b) { return a * b; }
-
-double div_double(double a, double b) { return a / b; }
-
-double sqrt_double(double a) { return __builtin_sqrt(a); }
-
-double fma_double(double a, double b, double c) {
-  return __builtin_fma(a, b, c);
-}
-
-double operator_double(char op, double a, double b, double c) {
-  switch (op) {
-  case '+':
-    return add_double(a, b);
-  case '-':
-    return sub_double(a, b);
-  case 'x':
-    return mul_double(a, b);
-  case '/':
-    return div_double(a, b);
-  case 's':
-    return sqrt_double(a);
-  case 'f':
-    return fma_double(a, b, c);
-  default:
-    fprintf(stderr, "Error: unknown operation\n");
-    abort();
-  }
-}
-
-float add_float(float a, float b) { return a + b; }
-
-float sub_float(float a, float b) { return a - b; }
-
-float mul_float(float a, float b) { return a * b; }
-
-float div_float(float a, float b) { return a / b; }
-
-float sqrt_float(float a) { return __builtin_sqrtf(a); }
-
-float fma_float(float a, float b, float c) { return __builtin_fmaf(a, b, c); }
-
-float operator_float(char op, float a, float b, float c) {
-  switch (op) {
-  case '+':
-    return add_float(a, b);
-  case '-':
-    return sub_float(a, b);
-  case 'x':
-    return mul_float(a, b);
-  case '/':
-    return div_float(a, b);
-  case 's':
-    return sqrt_float(a);
-  case 'f':
-    return fma_float(a, b, c);
-  default:
-    fprintf(stderr, "Error: unknown operation\n");
-    abort();
-  }
-}
-
-// void run_test() {
-//   double a = 0.1, b = 0.01, c = 0.0, d = 0.0, tau = 0.0, sigma = 0.0;
-//   fprintf(stderr, "\n");
-//   for (int i = 0; i < 10; i++) {
-//     twosum_double(a, b, &tau, &sigma);
-//     fprintf(stderr, "a = %.13a, b = %.13a, tau = %.13a, sigma = %.13a\n", a,
-//     b,
-//             tau, sigma);
-//     d = sr_rounding_double(tau, sigma, i / 10.0);
-//     fprintf(stderr, "round = %.13a\n", c);
-//   }
-// }
-
-int main(int argc, const char *argv[]) {
-
-  if (argc == 3) {
-    if (argv[1][0] != 's') {
-      fprintf(stderr, "usage: ./test <op> a\n");
-      exit(1);
-    }
-  } else if (argc == 5) {
-    if (argv[1][0] != 'f') {
-      fprintf(stderr, "usage: ./test <op> a b c\n");
-      exit(1);
-    }
-
-  } else if (argc != 4) {
-    fprintf(stderr, "usage: ./test <op> a b\n");
-    exit(1);
-  }
-  // run_test();
-
-  const char op = argv[1][0];
-  REAL a = atof(argv[2]), b = 0, c = 0;
-
-  if (op != 's')
-    b = atof(argv[3]);
-
-  if (op == 'f')
-    c = atof(argv[4]);
-
-#ifdef DOUBLE
-  // fprintf(stderr, "[double] op = %c, a = %.13a, b = %.13a\n", op, a, b);
-  printf("%.13a\n", operator_double(op, a, b, c));
-#elif defined(FLOAT)
-  // fprintf(stderr, "[float] op = %c, a = %.6a, b = %.6a\n", op, a, b);
-  printf("%.6a\n", operator_float(op, a, b, c));
-#endif
-
-  return EXIT_SUCCESS;
-}
diff --git a/tests/test_online_instrumentation_sr/test.sh b/tests/test_online_instrumentation_sr/test.sh
deleted file mode 100755
index d8559356..00000000
--- a/tests/test_online_instrumentation_sr/test.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-set -e
-
-echo "Test scalar"
-./test_scalar.sh
-
-if [[ $? != 0 ]]; then
-    echo "Failed!"
-    exit 1
-fi
-
-echo "Test vector"
-./test_vector.sh
-
-if [[ $? != 0 ]]; then
-    echo "Failed!"
-    exit 1
-else
-    echo "Success!"
-    exit 0
-fi