diff --git a/src/libvfcinstrumentonline/rand/build-ir.sh b/src/libvfcinstrumentonline/rand/build-ir.sh
index 58242ad2..49963c06 100755
--- a/src/libvfcinstrumentonline/rand/build-ir.sh
+++ b/src/libvfcinstrumentonline/rand/build-ir.sh
@@ -6,4 +6,11 @@ bazelisk build --repo_env=CC=$(llvm-config-14 --bindir)/clang \
     --cxxopt="-march=native" --cxxopt="-emit-llvm" --cxxopt="-O3" --save_temps \
     --cxxopt="-USR_DEBUG" --cxxopt="-g0"
 
+bazelisk build --repo_env=CC=$(llvm-config-14 --bindir)/clang \
+    --repo_env=CXX=$(llvm-config-14 --bindir)/clang++ \
+    --compile_one_dependency //src:ud_hw.cpp \
+    --cxxopt="-march=native" --cxxopt="-emit-llvm" --cxxopt="-O3" --save_temps \
+    --cxxopt="-USR_DEBUG" --cxxopt="-g0"
+
 cp -f bazel-bin/src/_objs/sr/sr_hw.pic.s sr_hw.ll
+cp -f bazel-bin/src/_objs/ud/ud_hw.pic.s ud_hw.ll
diff --git a/src/libvfcinstrumentonline/rand/src/BUILD b/src/libvfcinstrumentonline/rand/src/BUILD
index dc9cf666..02fc564b 100644
--- a/src/libvfcinstrumentonline/rand/src/BUILD
+++ b/src/libvfcinstrumentonline/rand/src/BUILD
@@ -10,7 +10,10 @@ exports_files([
     "sr_hw.h",
     "sr.h",
     "sr_hw-inl.h",
-    "ud.hpp",
+    "ud.h",
+    "ud_hw.h",
+    "ud_hw.cpp",
+    "ud_hw-inl.h",
     "utils.hpp",
     "xoroshiro256+_hw.hpp",
     "xoroshiro256+_hw.h",
@@ -20,23 +23,62 @@ exports_files([
 
 # Compiler options
 COPTS = [
-    "-std=c++20",
+    "-std=c++17",
     "-Wfatal-errors",
 ]
 
 filegroup(
-    name = "srcs",
-    srcs = glob([
-        "*.cpp",
-        "*.hpp",
-        "*.h",
-    ]),
+    name = "xoroshiro",
+    srcs = [
+        "debug_hwy-inl.h",
+        "random-inl.h",
+        "target-utils.h",
+        "xoroshiro256+_hw.cpp",
+        "xoroshiro256+_hw.h",
+        "xoroshiro256+_hw-inl.hpp",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "srcs-sr",
+    srcs = glob(
+        [
+            "*.cpp",
+            "*.hpp",
+            "*.h",
+        ],
+        exclude = [
+            "ud_hw.cpp",
+            "ud_hw.h",
+            "ud_hw-inl.h",
+            "ud.h",
+        ],
+    ),
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "srcs-ud",
+    srcs = glob(
+        [
+            "*.cpp",
+            "*.hpp",
+            "*.h",
+        ],
+        exclude = [
+            "sr_hw.cpp",
+            "sr_hw.h",
+            "sr_hw-inl.h",
+            "sr.h",
+        ],
+    ),
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "sr",
-    srcs = [":srcs"],
+    srcs = [":srcs-sr"],
     hdrs = ["//src:sr_hw.h"],
     copts = COPTS + [
         "-O2",
@@ -47,6 +89,19 @@ cc_library(
     deps = ["@hwy"],
 )
 
+cc_library(
+    name = "ud",
+    srcs = [":srcs-ud"],
+    hdrs = ["//src:ud_hw.h"],
+    copts = COPTS + [
+        "-O2",
+        "-Wall",
+        "-Wno-psabi",
+    ],
+    visibility = ["//visibility:public"],
+    deps = ["@hwy"],
+)
+
 cc_library(
     name = "sr-dbg",
     srcs = [
@@ -66,10 +121,8 @@ cc_library(
         "//src:sr_hw.h",
     ],
     copts = COPTS + [
-        "-std=c++20",
         "-Og",
         "-g",
-        "-march=native",
         "-Wfatal-errors",
         "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)",
         "-DSR_DEBUG",
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/generate.py b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/generate.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test
new file mode 100755
index 00000000..cddbe63b
Binary files /dev/null and b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test differ
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp
index 7348e0cd..ad274e79 100644
--- a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp
+++ b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp
@@ -3,6 +3,8 @@
 #include <iostream>
 #include <map>
 
+const std::size_t N = 1'000'000;
+
 #ifndef REAL
 #define REAL double
 #warning "REAL not defined, using double"
@@ -49,10 +51,10 @@ int main(int argc, char *argv[]) {
 
   std::map<REAL, int> visited;
 
-  for (int i = 0; i < 1000; i++)
+  for (int i = 0; i < N; i++)
     visited[apply_op(argv[1][0], a, b)]++;
 
-  compute_proba(visited, 1000);
+  compute_proba(visited, N);
 
   return 0;
 }
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_double b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_double
new file mode 100755
index 00000000..7c89fe72
Binary files /dev/null and b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_double differ
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_float b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_float
new file mode 100755
index 00000000..7b477ff3
Binary files /dev/null and b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_float differ
diff --git a/src/libvfcinstrumentonline/rand/src/random-inl.h b/src/libvfcinstrumentonline/rand/src/random-inl.h
index 69360e90..31cbeb81 100644
--- a/src/libvfcinstrumentonline/rand/src/random-inl.h
+++ b/src/libvfcinstrumentonline/rand/src/random-inl.h
@@ -205,7 +205,7 @@ class Xoshiro {
 class VectorXoshiro {
 private:
   using VU64 = Vec<ScalableTag<std::uint64_t>>;
-  using VU32 = Vec<ScalableTag<std::uint64_t>>;
+  using VU32 = Vec<ScalableTag<std::uint32_t>>;
   using StateType = AlignedNDArray<std::uint64_t, 2>;
   using VF32 = Vec<ScalableTag<float>>;
 #if HWY_HAVE_FLOAT64
@@ -232,9 +232,34 @@ class VectorXoshiro {
     }
   }
 
-  HWY_INLINE VU64 operator()() noexcept { return Next(); }
+  HWY_INLINE VU32 operator()(std::uint32_t) noexcept {
+    const auto result = Next();
+    return BitCast(ScalableTag<std::uint32_t>{}, result);
+  }
+
+  HWY_INLINE VU64 operator()(std::uint64_t) noexcept { return Next(); }
 
-  AlignedVector<std::uint64_t> operator()(const std::size_t n) {
+  AlignedVector<std::uint32_t> operator()(std::uint32_t, const std::size_t n) {
+    const auto u32_tag = ScalableTag<std::uint32_t>{};
+    AlignedVector<std::uint32_t> result(n);
+    const ScalableTag<std::uint64_t> tag{};
+    auto s0 = Load(tag, state_[{0}].data());
+    auto s1 = Load(tag, state_[{1}].data());
+    auto s2 = Load(tag, state_[{2}].data());
+    auto s3 = Load(tag, state_[{3}].data());
+    for (std::uint64_t i = 0; i < n; i += Lanes(u32_tag)) {
+      const auto next = Update(s0, s1, s2, s3);
+      const auto next_u32 = BitCast(u32_tag, next);
+      Store(next_u32, u32_tag, result.data() + i);
+    }
+    Store(s0, tag, state_[{0}].data());
+    Store(s1, tag, state_[{1}].data());
+    Store(s2, tag, state_[{2}].data());
+    Store(s3, tag, state_[{3}].data());
+    return result;
+  }
+
+  AlignedVector<std::uint64_t> operator()(std::uint64_t, const std::size_t n) {
     AlignedVector<std::uint64_t> result(n);
     const ScalableTag<std::uint64_t> tag{};
     auto s0 = Load(tag, state_[{0}].data());
@@ -252,8 +277,29 @@ class VectorXoshiro {
     return result;
   }
 
+  template <std::uint32_t N>
+  std::array<std::uint32_t, N> operator()(std::uint32_t) noexcept {
+    alignas(HWY_ALIGNMENT) std::array<std::uint32_t, N> result;
+    const ScalableTag<std::uint64_t> tag{};
+    const ScalableTag<std::uint32_t> u32_tag{};
+    auto s0 = Load(tag, state_[{0}].data());
+    auto s1 = Load(tag, state_[{1}].data());
+    auto s2 = Load(tag, state_[{2}].data());
+    auto s3 = Load(tag, state_[{3}].data());
+    for (std::uint64_t i = 0; i < N; i += Lanes(u32_tag)) {
+      const auto next = Update(s0, s1, s2, s3);
+      const auto next_u32 = BitCast(u32_tag, next);
+      Store(next_u32, u32_tag, result.data() + i);
+    }
+    Store(s0, tag, state_[{0}].data());
+    Store(s1, tag, state_[{1}].data());
+    Store(s2, tag, state_[{2}].data());
+    Store(s3, tag, state_[{3}].data());
+    return result;
+  }
+
   template <std::uint64_t N>
-  std::array<std::uint64_t, N> operator()() noexcept {
+  std::array<std::uint64_t, N> operator()(std::uint64_t) noexcept {
     alignas(HWY_ALIGNMENT) std::array<std::uint64_t, N> result;
     const ScalableTag<std::uint64_t> tag{};
     auto s0 = Load(tag, state_[{0}].data());
@@ -294,10 +340,7 @@ class VectorXoshiro {
   }
 
   HWY_INLINE AlignedVector<float> Uniform(float, const std::size_t n) {
-    // std::cerr << "Uniform float start\n";
     AlignedVector<float> result(n);
-    // std::cerr << "result address: " << &result << "\n";
-    // std::cerr << "result size: " << result.size() << "\n";
     const ScalableTag<std::uint32_t> u32_tag{};
     const ScalableTag<std::uint64_t> tag{};
     const ScalableTag<float> real_tag{};
@@ -309,22 +352,17 @@ class VectorXoshiro {
     auto s3 = Load(tag, state_[{3}].data());
 
     for (std::size_t i = 0; i < n; i += Lanes(real_tag)) {
-      // std::cerr << "i: " << i << "\n";
       const auto next = Update(s0, s1, s2, s3);
       const auto bits = BitCast(u32_tag, next);
       const auto bitscast = ShiftRight<8>(bits);
       const auto real = ConvertTo(real_tag, bitscast);
       const auto uniform = Mul(real, MUL_VALUE);
-      // std::cerr << "store at " << i << " " << result.data() + i << "\n";
       Store(uniform, real_tag, result.data() + i);
     }
-
     Store(s0, tag, state_[{0}].data());
     Store(s1, tag, state_[{1}].data());
     Store(s2, tag, state_[{2}].data());
     Store(s3, tag, state_[{3}].data());
-
-    // std::cerr << "Uniform float end\n";
     return result;
   }
 
@@ -466,12 +504,12 @@ template <std::uint64_t size = 1024> class CachedXoshiro {
 
   explicit CachedXoshiro(const result_type seed,
                          const result_type threadNumber = 0)
-      : generator_{seed, threadNumber}, cache_{generator_.operator()<size>()},
-        index_{0} {}
+      : generator_{seed, threadNumber},
+        cache_{generator_.operator()<size>(result_type{})}, index_{0} {}
 
   result_type operator()() noexcept {
     if (HWY_UNLIKELY(index_ == size)) {
-      cache_ = std::move(generator_.operator()<size>());
+      cache_ = std::move(generator_.operator()<size>(result_type{}));
       index_ = 0;
     }
     return cache_[index_++];
diff --git a/src/libvfcinstrumentonline/rand/src/sr.h b/src/libvfcinstrumentonline/rand/src/sr.h
index 230546ec..89c92e88 100644
--- a/src/libvfcinstrumentonline/rand/src/sr.h
+++ b/src/libvfcinstrumentonline/rand/src/sr.h
@@ -34,7 +34,7 @@ template <typename T> inline T sround(const T sigma, const T tau) {
   debug_start();
   if (tau == 0) {
     debug_end();
-    return sigma;
+    return 0;
   }
   constexpr int32_t mantissa = IEEE754<T>::mantissa;
   const bool sign_tau = tau < 0;
diff --git a/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h b/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h
index f173e1bc..fac8f114 100644
--- a/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h
+++ b/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h
@@ -49,15 +49,18 @@ void twosum(V a, V b, V &sigma, V &tau) {
 }
 
 /*
+"Emulation of the FMA in rounded-to-nearest loating-point arithmetic"
+Stef Graillat, Jean-Michel Muller
+---
 Algorithm 3 – Split(x, s). Veltkamp’s splitting algorithm. Returns a pair
 (xh, xℓ) of FP numbers such that the significand of xh fits in s − p bits, the
 significand of xℓ fits in s − 1 bits, and xh + xℓ = x.
-Require: K = 2s + 1
+Require: K = 2^s + 1
 Require: 2 ≤ s ≤ p − 2
 γ ← RN(K · x)
 δ ← RN(x − γ)
-ah ← RN(γ + δ)
-aℓ ← RN(x − ah)
+xh ← RN(γ + δ)
+xℓ ← RN(x − xh)
 return (xh, xℓ)
 */
 template <class D, class V = hn::TFromD<D>, typename T = hn::TFromD<D>>
@@ -80,6 +83,9 @@ void Split(V x, V &xh, V &xl) {
 }
 
 /*
+"Emulation of the FMA in rounded-to-nearest loating-point arithmetic"
+Stef Graillat, Jean-Michel Muller
+---
 Algorithm 4 – DekkerProd(a, b). Dekker’s product. Returns a pair (πh, πℓ)
 of FP numbers such that πh = RN(ab) and πh + πℓ = ab.
 Require: s = ⌈p/2⌉
@@ -115,6 +121,9 @@ void DekkerProd(V a, V b, V &pi_h, V &pi_l) {
 }
 
 /*
+"Emulation of the FMA in rounded-to-nearest loating-point arithmetic"
+Stef Graillat, Jean-Michel Muller
+---
 Algorithm 7 EmulFMA(a, b, c).
 Require: P = 2^(p−1) + 1
 Require: Q = 2^(p−1)
@@ -192,11 +201,12 @@ V fma(V a, V b, V c) {
   auto g = hn::Mul(t, w);
   auto mask3 = hn::Lt(g, hn::Zero(d)); // if g < 0 then
 
-  auto res = hn::IfThenElse(
-      mask, d_temp_1,
-      hn::IfThenElse(mask1, z_h,
-                     hn::IfThenElse(mask2, d_temp_2,
-                                    hn::IfThenElse(mask3, z_h, d_temp_2))));
+  auto ret3 = hn::IfThenElse(mask3, z_h, d_temp_2);
+  auto ret2 = hn::IfThenElse(mask2, d_temp_2, ret3);
+  auto ret1 = hn::IfThenElse(mask1, z_h, ret2);
+  auto ret = hn::IfThenElse(mask, d_temp_1, ret1);
+
+  auto res = ret;
 
   debug_vec<D>("[fma] res", res);
   debug_msg("[fma] END\n");
@@ -276,7 +286,7 @@ HWY_INLINE hn::Vec<D> FastPow2I(D d, VI x) {
   const hn::Rebind<hwy::MakeSigned<D>, D> di;
   const auto kOffset = Set(di, kOffsetS);
   const auto offset = Add(x, kOffset);
-  const auto shift = ShiftLeft<mantissa>(offset);
+  const auto shift = hn::ShiftLeft<mantissa>(offset);
   return BitCast(d, shift);
 }
 
@@ -329,7 +339,7 @@ hn::Vec<D> pow2(D d, V n) {
   return hn::BitCast(d, res);
 }
 
-template <class D, class V, typename T = hn::TFromD<D>>
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
 V sr_round(V sigma, V tau) {
   debug_msg("\n[sr_round] START");
   debug_vec<D>("[sr_round] sigma", sigma);
@@ -362,14 +372,17 @@ V sr_round(V sigma, V tau) {
 
   auto exp = hn::Sub(eta, hn::Set(di, mantissa));
   auto abs_ulp = pow2(d, exp);
-  debug_vec<D>("[sr_round] abs_ulp", abs_ulp);
+  debug_vec<D>("[sr_round] |ulp|", abs_ulp);
+
   auto ulp = hn::CopySign(abs_ulp, tau);
   debug_vec<D>("[sr_round] ulp", ulp);
 
   auto pi = hn::Mul(ulp, z);
   debug_vec<D>("[sr_round] pi", pi);
+
   auto abs_tau_plus_pi = hn::Abs(hn::Add(tau, pi));
-  debug_vec<D>("[sr_round] abs_tau_plus_pi", abs_tau_plus_pi);
+  debug_vec<D>("[sr_round] |tau|+pi", abs_tau_plus_pi);
+
   auto round = hn::IfThenElse(hn::Ge(abs_tau_plus_pi, abs_ulp), ulp, zero);
   debug_vec<D>("[sr_round] round", round);
 
diff --git a/src/libvfcinstrumentonline/rand/src/sr_hw.cpp b/src/libvfcinstrumentonline/rand/src/sr_hw.cpp
index 5053a087..4e34d2ca 100644
--- a/src/libvfcinstrumentonline/rand/src/sr_hw.cpp
+++ b/src/libvfcinstrumentonline/rand/src/sr_hw.cpp
@@ -294,17 +294,26 @@ define_fp_xN_unary_op(float, f32, 16, sqrt);
 #endif
 
 template <typename T>
-void _round(const T *HWY_RESTRICT sigma, const T *HWY_RESTRICT tau,
-            T *HWY_RESTRICT result, const size_t count) {
+HWY_NOINLINE void _round(const T *HWY_RESTRICT sigma, const T *HWY_RESTRICT tau,
+                         T *HWY_RESTRICT result, const size_t count) {
   using D = hn::ScalableTag<T>;
   const D d;
   const size_t N = hn::Lanes(d);
-  for (size_t i = 0; i + N <= count; i += N) {
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
     auto sigma_vec = hn::Load(d, sigma + i);
     auto tau_vec = hn::Load(d, tau + i);
     auto res = sr_round<D>(sigma_vec, tau_vec);
     hn::Store(res, d, result + i);
   }
+  // using C = hn::CappedTag<T, 1>;
+  // const C c;
+  // for (; i < count; ++i) {
+  //   auto sigma_vec = hn::Load(d, sigma + i);
+  //   auto tau_vec = hn::Load(d, tau + i);
+  //   auto res = sr_round<C>(sigma_vec, tau_vec);
+  //   hn::Store(res, c, result + i);
+  // }
 }
 
 template <typename T>
@@ -336,12 +345,21 @@ HWY_NOINLINE void _sub(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
   using D = hn::ScalableTag<T>;
   const D d;
   const size_t N = hn::Lanes(d);
-  for (size_t i = 0; i + N <= count; i += N) {
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
     auto a_vec = hn::Load(d, a + i);
     auto b_vec = hn::Load(d, b + i);
     auto res = sr_sub<D>(a_vec, b_vec);
     hn::Store(res, d, result + i);
   }
+  using C = hn::CappedTag<T, 1>;
+  const C c;
+  for (; i < count; ++i) {
+    auto a_vec = hn::Load(c, a + i);
+    auto b_vec = hn::Load(c, b + i);
+    auto res = sr_sub<C>(a_vec, b_vec);
+    hn::Store(res, c, result + i);
+  }
 }
 
 template <typename T>
@@ -350,12 +368,21 @@ HWY_NOINLINE void _mul(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
   using D = hn::ScalableTag<T>;
   const hn::ScalableTag<T> d;
   const size_t N = hn::Lanes(d);
-  for (size_t i = 0; i + N <= count; i += N) {
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
     auto a_vec = hn::Load(d, a + i);
     auto b_vec = hn::Load(d, b + i);
     auto res = sr_mul<D>(a_vec, b_vec);
     hn::Store(res, d, result + i);
   }
+  using C = hn::CappedTag<T, 1>;
+  const C c;
+  for (; i < count; ++i) {
+    auto a_vec = hn::Load(c, a + i);
+    auto b_vec = hn::Load(c, b + i);
+    auto res = sr_mul<C>(a_vec, b_vec);
+    hn::Store(res, c, result + i);
+  }
 }
 
 template <typename T>
@@ -364,12 +391,21 @@ HWY_NOINLINE void _div(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
   using D = hn::ScalableTag<T>;
   const hn::ScalableTag<T> d;
   const size_t N = hn::Lanes(d);
-  for (size_t i = 0; i + N <= count; i += N) {
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
     auto a_vec = hn::Load(d, a + i);
     auto b_vec = hn::Load(d, b + i);
     auto res = sr_div<D>(a_vec, b_vec);
     hn::Store(res, d, result + i);
   }
+  using C = hn::CappedTag<T, 1>;
+  const C c;
+  for (; i < count; ++i) {
+    auto a_vec = hn::Load(c, a + i);
+    auto b_vec = hn::Load(c, b + i);
+    auto res = sr_div<C>(a_vec, b_vec);
+    hn::Store(res, c, result + i);
+  }
 }
 
 template <typename T>
@@ -378,11 +414,19 @@ HWY_NOINLINE void _sqrt(const T *HWY_RESTRICT a, T *HWY_RESTRICT result,
   using D = hn::ScalableTag<T>;
   const hn::ScalableTag<T> d;
   const size_t N = hn::Lanes(d);
-  for (size_t i = 0; i + N <= count; i += N) {
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
     auto a_vec = hn::Load(d, a + i);
     auto res = sr_sqrt<D>(a_vec);
     hn::Store(res, d, result + i);
   }
+  // using C = hn::CappedTag<T, 1>;
+  // const C c;
+  // for (; i < count; ++i) {
+  //   auto a_vec = hn::Load(d, a + i);
+  //   auto res = sr_sqrt<C>(a_vec);
+  //   hn::Store(res, c, result + i);
+  // }
 }
 
 void _round_f32(const float *HWY_RESTRICT sigma, const float *HWY_RESTRICT tau,
diff --git a/src/libvfcinstrumentonline/rand/src/ud.h b/src/libvfcinstrumentonline/rand/src/ud.h
new file mode 100644
index 00000000..5749dba8
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/src/ud.h
@@ -0,0 +1,40 @@
+#ifndef __VERIFICARLO_SRLIB_UD_HPP__
+#define __VERIFICARLO_SRLIB_UD_HPP__
+
+#include "debug.hpp"
+#include "eft.hpp"
+#include "utils.hpp"
+#include "xoroshiro256+_hw.h"
+
+namespace ud {
+namespace scalar {
+
+template <typename T> T udround(T a) {
+  debug_start();
+  if (a == 0) {
+    debug_end();
+    return a;
+  }
+  debug_print("a        = %.13a\n", a);
+  using I = typename sr::utils::IEEE754<T>::I;
+  I a_bits = *reinterpret_cast<I *>(&a);
+  std::uint64_t rand = sr::scalar::xoroshiro256plus::static_dispatch::random();
+  debug_print("rand     = 0x%02x\n", rand);
+  // get the last bit of the random number to get -1 or 1
+  a_bits += 1 - ((rand & 1) << 1);
+  a = *reinterpret_cast<T *>(&a_bits);
+  debug_print("round(a) = %.13a\n", a);
+  debug_end();
+  return a;
+}
+
+template <typename T> T add(T a, T b) { return udround(a + b); }
+template <typename T> T sub(T a, T b) { return udround(a - b); }
+template <typename T> T mul(T a, T b) { return udround(a * b); }
+template <typename T> T div(T a, T b) { return udround(a / b); }
+template <typename T> T sqrt(T a) { return udround(std::sqrt(a)); }
+
+} // namespace scalar
+} // namespace ud
+
+#endif // __UD_HPP__
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/src/ud.hpp b/src/libvfcinstrumentonline/rand/src/ud.hpp
deleted file mode 100644
index 15cb172d..00000000
--- a/src/libvfcinstrumentonline/rand/src/ud.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef __VERIFICARLO_SRLIB_UD_HPP__
-#define __VERIFICARLO_SRLIB_UD_HPP__
-
-#include "debug.hpp"
-#include "eft.hpp"
-#include "rand.hpp"
-#include "utils.hpp"
-
-template <typename T> T ud_round(T a) {
-  debug_start();
-  if (a == 0) {
-    debug_end();
-    return a;
-  }
-  debug_print("a = %.13a\n", a);
-  using I = typename sr::utils::IEEE754<T>::I;
-  I a_bits = *reinterpret_cast<I *>(&a);
-  uint8_t rand = get_rand_uint1();
-  debug_print("rand = 0x%02x\n", rand);
-  a_bits += 1 - (rand * 2);
-  debug_print("ud_round(%.13a)", a);
-  a = *reinterpret_cast<T *>(&a_bits);
-  debug_print(" = %.13a\n", a);
-  debug_end();
-  return a;
-}
-
-template <typename T> T ud_add(T a, T b) { return ud_round(a + b); }
-template <typename T> T ud_sub(T a, T b) { return ud_add(a, -b); }
-template <typename T> T ud_mul(T a, T b) { return ud_round(a * b); }
-template <typename T> T ud_div(T a, T b) { return ud_round(a / b); }
-
-float ud_add_float(float a, float b) { return ud_add<float>(a, b); }
-float ud_sub_float(float a, float b) { return ud_sub<float>(a, b); }
-float ud_mul_float(float a, float b) { return ud_mul<float>(a, b); }
-float ud_div_float(float a, float b) { return ud_div<float>(a, b); }
-
-double ud_add_double(double a, double b) { return ud_add<double>(a, b); }
-double ud_sub_double(double a, double b) { return ud_sub<double>(a, b); }
-double ud_mul_double(double a, double b) { return ud_mul<double>(a, b); }
-double ud_div_double(double a, double b) { return ud_div<double>(a, b); }
-
-#endif // __UD_HPP__
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/src/ud_hw-inl.h b/src/libvfcinstrumentonline/rand/src/ud_hw-inl.h
new file mode 100644
index 00000000..60b0b9ac
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/src/ud_hw-inl.h
@@ -0,0 +1,139 @@
+#if defined(HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_
+#undef HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_
+#else
+#define HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_
+#endif
+
+// clang-format off
+#include "hwy/highway.h"
+#include "hwy/print-inl.h"
+#include "src/debug_hwy-inl.h"
+#include "src/utils.hpp"
+#include "src/xoroshiro256+_hw-inl.hpp"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE(); // at file scope
+namespace ud {
+namespace vector {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+namespace rng = sr::vector::xoroshiro256plus::HWY_NAMESPACE;
+namespace dbg = sr::vector::HWY_NAMESPACE;
+
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+V round(V a) {
+  debug_start();
+  dbg::debug_vec<D>("[round] a", a);
+
+  using DI = hn::RebindToSigned<D>;
+  using DU = hn::RebindToUnsigned<D>;
+  const DI di{};
+  const DU du{};
+  const D d{};
+
+  auto is_not_zero = hn::Ne(a, hn::Set(d, 0));
+  auto is_finite = hn::IsFinite(a);
+  auto must_be_rounded = hn::And(is_finite, is_not_zero);
+
+  // rand = 1 - 2 * (z & 1)
+  auto z = rng::random(du);
+  auto z_di = hn::BitCast(di, z);
+  auto z_last_bit = hn::And(hn::Set(di, 0x1), z_di);
+  auto z_last_bit_two = hn::ShiftLeft<1>(z_last_bit);
+  auto rand = hn::Sub(hn::Set(di, 1), z_last_bit_two);
+
+  auto a_di = hn::BitCast(di, a);
+  auto a_rounded = hn::Add(a_di, rand);
+  auto res = hn::BitCast(d, a_rounded);
+
+  auto ret = hn::IfThenElse(must_be_rounded, res, a);
+
+  dbg::debug_vec<D>("[round] res", ret);
+  debug_end();
+
+  return ret;
+}
+
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+V add(V a, V b) {
+  debug_start();
+  dbg::debug_vec<D>("[add] a", a);
+  dbg::debug_vec<D>("[add] b", b);
+
+  V c = hn::Add(a, b);
+  auto res = round<D>(c);
+
+  dbg::debug_vec<D>("[add] res", res);
+  debug_end();
+
+  return res;
+}
+
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+V sub(V a, V b) {
+  debug_start();
+  dbg::debug_vec<D>("[sub] a", a);
+  dbg::debug_vec<D>("[sub] b", b);
+
+  V c = hn::Sub(a, b);
+  auto res = round<D>(c);
+
+  dbg::debug_vec<D>("[sub] res", res);
+  debug_end();
+
+  return res;
+}
+
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+V mul(V a, V b) {
+  debug_start();
+  dbg::debug_vec<D>("[mul] a", a);
+  dbg::debug_vec<D>("[mul] b", b);
+
+  V c = hn::Mul(a, b);
+  auto res = round<D>(c);
+
+  dbg::debug_vec<D>("[mul] res", res);
+  debug_end();
+
+  return res;
+}
+
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+V div(V a, V b) {
+  debug_start();
+  dbg::debug_vec<D>("[div] a", a);
+  dbg::debug_vec<D>("[div] b", b);
+
+  V c = hn::Div(a, b);
+  auto res = round<D>(c);
+
+  dbg::debug_vec<D>("[div] res", res);
+  debug_end();
+
+  return res;
+}
+
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+V sqrt(V a) {
+  debug_start();
+  dbg::debug_vec<D>("[sqrt] a", a);
+
+  V c = hn::Sqrt(a);
+  auto res = round<D>(c);
+
+  dbg::debug_vec<D>("[sqrt] res", res);
+  debug_end();
+
+  return res;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace vector
+} // namespace ud
+HWY_AFTER_NAMESPACE();
+
+#endif // HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_
diff --git a/src/libvfcinstrumentonline/rand/src/ud_hw.cpp b/src/libvfcinstrumentonline/rand/src/ud_hw.cpp
new file mode 100644
index 00000000..f8ecaec1
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/src/ud_hw.cpp
@@ -0,0 +1,1584 @@
+
+#include "src/debug.hpp"
+#include "src/ud.h"
+
+// Generates code for every target that this compiler can support.
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "src/ud_hw.cpp" // this file
+#include "hwy/foreach_target.h"            // must come before highway.h
+
+// clang-format off
+#include "hwy/highway.h"
+#include "src/ud_hw-inl.h"
+#include "src/ud_hw.h"
+#include "src/xoroshiro256+_hw-inl.hpp"
+// clang-format on
+
+// Optional, can instead add HWY_ATTR to all functions.
+HWY_BEFORE_NAMESPACE();
+namespace ud {
+namespace vector {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+namespace internal {
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+V _add_fp_xN(V a, V b) {
+  auto res = add<D>(a, b);
+  return res;
+}
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+V _sub_fp_xN(V a, V b) {
+  auto res = sub<D>(a, b);
+  return res;
+}
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+V _mul_fp_xN(V a, V b) {
+  auto res = mul<D>(a, b);
+  return res;
+}
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+V _div_fp_xN(V a, V b) {
+  auto res = div<D>(a, b);
+  return res;
+}
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+V _sqrt_fp_xN(V a) {
+  auto res = sqrt<D>(a);
+  return res;
+}
+
+/* 32-bits */
+#if HWY_MAX_BYTES >= 4
+using f32x1_t = hn::FixedTag<float, 1>;
+using f32x1 = hn::Vec<f32x1_t>;
+
+f32x1 _add_f32_x1(f32x1 a, f32x1 b) { return _add_fp_xN<float, 1>(a, b); }
+f32x1 _sub_f32_x1(f32x1 a, f32x1 b) { return _sub_fp_xN<float, 1>(a, b); }
+f32x1 _mul_f32_x1(f32x1 a, f32x1 b) { return _mul_fp_xN<float, 1>(a, b); }
+f32x1 _div_f32_x1(f32x1 a, f32x1 b) { return _div_fp_xN<float, 1>(a, b); }
+f32x1 _sqrt_f32_x1(f32x1 a) { return _sqrt_fp_xN<float, 1>(a); }
+#endif
+
+/* 64-bits */
+#if HWY_MAX_BYTES >= 8
+using f64x1_t = hn::FixedTag<double, 1>;
+using f64x1 = hn::Vec<f64x1_t>;
+using f32x2_t = hn::FixedTag<float, 2>;
+using f32x2 = hn::Vec<f32x2_t>;
+
+f64x1 _add_f64_x1(f64x1 a, f64x1 b) { return _add_fp_xN<double, 1>(a, b); }
+f64x1 _sub_f64_x1(f64x1 a, f64x1 b) { return _sub_fp_xN<double, 1>(a, b); }
+f64x1 _mul_f64_x1(f64x1 a, f64x1 b) { return _mul_fp_xN<double, 1>(a, b); }
+f64x1 _div_f64_x1(f64x1 a, f64x1 b) { return _div_fp_xN<double, 1>(a, b); }
+f64x1 _sqrt_f64_x1(f64x1 a) { return _sqrt_fp_xN<double, 1>(a); }
+
+f32x2 _add_f32_x2(f32x2 a, f32x2 b) { return _add_fp_xN<float, 2>(a, b); }
+f32x2 _sub_f32_x2(f32x2 a, f32x2 b) { return _sub_fp_xN<float, 2>(a, b); }
+f32x2 _mul_f32_x2(f32x2 a, f32x2 b) { return _mul_fp_xN<float, 2>(a, b); }
+f32x2 _div_f32_x2(f32x2 a, f32x2 b) { return _div_fp_xN<float, 2>(a, b); }
+f32x2 _sqrt_f32_x2(f32x2 a) { return _sqrt_fp_xN<float, 2>(a); }
+#endif
+
+/* 128-bits */
+#if HWY_MAX_BYTES >= 16
+using f64x2_t = hn::FixedTag<double, 2>;
+using f64x2 = hn::Vec<f64x2_t>;
+using f32x4_t = hn::FixedTag<float, 4>;
+using f32x4 = hn::Vec<f32x4_t>;
+
+f64x2 _add_f64_x2(f64x2 a, f64x2 b) { return _add_fp_xN<double, 2>(a, b); }
+f64x2 _sub_f64_x2(f64x2 a, f64x2 b) { return _sub_fp_xN<double, 2>(a, b); }
+f64x2 _mul_f64_x2(f64x2 a, f64x2 b) { return _mul_fp_xN<double, 2>(a, b); }
+f64x2 _div_f64_x2(f64x2 a, f64x2 b) { return _div_fp_xN<double, 2>(a, b); }
+f64x2 _sqrt_f64_x2(f64x2 a) { return _sqrt_fp_xN<double, 2>(a); }
+
+f32x4 _add_f32_x4(f32x4 a, f32x4 b) { return _add_fp_xN<float, 4>(a, b); }
+f32x4 _sub_f32_x4(f32x4 a, f32x4 b) { return _sub_fp_xN<float, 4>(a, b); }
+f32x4 _mul_f32_x4(f32x4 a, f32x4 b) { return _mul_fp_xN<float, 4>(a, b); }
+f32x4 _div_f32_x4(f32x4 a, f32x4 b) { return _div_fp_xN<float, 4>(a, b); }
+f32x4 _sqrt_f32_x4(f32x4 a) { return _sqrt_fp_xN<float, 4>(a); }
+
+#endif
+
+/* 256-bits */
+#if HWY_MAX_BYTES >= 32
+using f64x4_t = hn::FixedTag<double, 4>;
+using f64x4 = hn::Vec<f64x4_t>;
+using f32x8_t = hn::FixedTag<float, 8>;
+using f32x8 = hn::Vec<f32x8_t>;
+
+f64x4 _add_f64_x4(f64x4 a, f64x4 b) { return _add_fp_xN<double, 4>(a, b); }
+f64x4 _sub_f64_x4(f64x4 a, f64x4 b) { return _sub_fp_xN<double, 4>(a, b); }
+f64x4 _mul_f64_x4(f64x4 a, f64x4 b) { return _mul_fp_xN<double, 4>(a, b); }
+f64x4 _div_f64_x4(f64x4 a, f64x4 b) { return _div_fp_xN<double, 4>(a, b); }
+f64x4 _sqrt_f64_x4(f64x4 a) { return _sqrt_fp_xN<double, 4>(a); }
+
+f32x8 _add_f32_x8(f32x8 a, f32x8 b) { return _add_fp_xN<float, 8>(a, b); }
+f32x8 _sub_f32_x8(f32x8 a, f32x8 b) { return _sub_fp_xN<float, 8>(a, b); }
+f32x8 _mul_f32_x8(f32x8 a, f32x8 b) { return _mul_fp_xN<float, 8>(a, b); }
+f32x8 _div_f32_x8(f32x8 a, f32x8 b) { return _div_fp_xN<float, 8>(a, b); }
+f32x8 _sqrt_f32_x8(f32x8 a) { return _sqrt_fp_xN<float, 8>(a); }
+#endif
+
+/* 512-bits */
+#if HWY_MAX_BYTES >= 64
+using f64x8_t = hn::FixedTag<double, 8>;
+using f64x8 = hn::Vec<f64x8_t>;
+using f32x16_t = hn::FixedTag<float, 16>;
+using f32x16 = hn::Vec<f32x16_t>;
+
+f64x8 _add_f64_x8(f64x8 a, f64x8 b) { return _add_fp_xN<double, 8>(a, b); }
+f64x8 _sub_f64_x8(f64x8 a, f64x8 b) { return _sub_fp_xN<double, 8>(a, b); }
+f64x8 _mul_f64_x8(f64x8 a, f64x8 b) { return _mul_fp_xN<double, 8>(a, b); }
+f64x8 _div_f64_x8(f64x8 a, f64x8 b) { return _div_fp_xN<double, 8>(a, b); }
+f64x8 _sqrt_f64_x8(f64x8 a) { return _sqrt_fp_xN<double, 8>(a); }
+
+f32x16 _add_f32_x16(f32x16 a, f32x16 b) { return _add_fp_xN<float, 16>(a, b); }
+f32x16 _sub_f32_x16(f32x16 a, f32x16 b) { return _sub_fp_xN<float, 16>(a, b); }
+f32x16 _mul_f32_x16(f32x16 a, f32x16 b) { return _mul_fp_xN<float, 16>(a, b); }
+f32x16 _div_f32_x16(f32x16 a, f32x16 b) { return _div_fp_xN<float, 16>(a, b); }
+f32x16 _sqrt_f32_x16(f32x16 a) { return _sqrt_fp_xN<float, 16>(a); }
+#endif
+} // namespace internal
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+void _addxN(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+            T *HWY_RESTRICT c) {
+  const D d;
+  const auto va = hn::Load(d, a);
+  const auto vb = hn::Load(d, b);
+  auto res = internal::_add_fp_xN<T, N>(va, vb);
+  hn::Store(res, d, c);
+}
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+void _subxN(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+            T *HWY_RESTRICT c) {
+  const D d;
+  const auto va = hn::Load(d, a);
+  const auto vb = hn::Load(d, b);
+  auto res = internal::_sub_fp_xN<T, N>(va, vb);
+  hn::Store(res, d, c);
+}
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+void _mulxN(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+            T *HWY_RESTRICT c) {
+  const D d;
+  const auto va = hn::Load(d, a);
+  const auto vb = hn::Load(d, b);
+  auto res = internal::_mul_fp_xN<T, N>(va, vb);
+  hn::Store(res, d, c);
+}
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+void _divxN(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+            T *HWY_RESTRICT c) {
+  const D d;
+  const auto va = hn::Load(d, a);
+  const auto vb = hn::Load(d, b);
+  auto res = internal::_div_fp_xN<T, N>(va, vb);
+  hn::Store(res, d, c);
+}
+
+template <typename T, std::size_t N, class D = hn::FixedTag<T, N>,
+          class V = hn::Vec<D>>
+void _sqrtxN(const T *HWY_RESTRICT a, T *HWY_RESTRICT c) {
+  const D d;
+  const auto va = hn::Load(d, a);
+  auto res = internal::_sqrt_fp_xN<T, N>(va);
+  hn::Store(res, d, c);
+}
+
+/* _<op>x<size>_<type> */
+
+#define define_fp_xN_unary_op(type, name, size, op)                            \
+  void _##op##x##size##_##name(const type *HWY_RESTRICT a,                     \
+                               type *HWY_RESTRICT c) {                         \
+    _##op##xN<type, size>(a, c);                                               \
+  }
+
+#define define_fp_xN_bin_op(type, name, size, op)                              \
+  void _##op##x##size##_##name(const type *HWY_RESTRICT a,                     \
+                               const type *HWY_RESTRICT b,                     \
+                               type *HWY_RESTRICT c) {                         \
+    _##op##xN<type, size>(a, b, c);                                            \
+  }
+
+/* 32-bits */
+#if HWY_MAX_BYTES >= 4
+define_fp_xN_bin_op(float, f32, 1, add);
+define_fp_xN_bin_op(float, f32, 1, sub);
+define_fp_xN_bin_op(float, f32, 1, mul);
+define_fp_xN_bin_op(float, f32, 1, div);
+define_fp_xN_unary_op(float, f32, 1, sqrt);
+#endif
+
+/* 64-bits */
+#if HWY_MAX_BYTES >= 8
+define_fp_xN_bin_op(double, f64, 1, add);
+define_fp_xN_bin_op(double, f64, 1, sub);
+define_fp_xN_bin_op(double, f64, 1, mul);
+define_fp_xN_bin_op(double, f64, 1, div);
+define_fp_xN_unary_op(double, f64, 1, sqrt);
+
+define_fp_xN_bin_op(float, f32, 2, add);
+define_fp_xN_bin_op(float, f32, 2, sub);
+define_fp_xN_bin_op(float, f32, 2, mul);
+define_fp_xN_bin_op(float, f32, 2, div);
+define_fp_xN_unary_op(float, f32, 2, sqrt);
+#endif
+
+/* 128-bits */
+#if HWY_MAX_BYTES >= 16
+define_fp_xN_bin_op(double, f64, 2, add);
+define_fp_xN_bin_op(double, f64, 2, sub);
+define_fp_xN_bin_op(double, f64, 2, mul);
+define_fp_xN_bin_op(double, f64, 2, div);
+define_fp_xN_unary_op(double, f64, 2, sqrt);
+
+define_fp_xN_bin_op(float, f32, 4, add);
+define_fp_xN_bin_op(float, f32, 4, sub);
+define_fp_xN_bin_op(float, f32, 4, mul);
+define_fp_xN_bin_op(float, f32, 4, div);
+define_fp_xN_unary_op(float, f32, 4, sqrt);
+#endif
+
+/* 256-bits */
+#if HWY_MAX_BYTES >= 32
+define_fp_xN_bin_op(double, f64, 4, add);
+define_fp_xN_bin_op(double, f64, 4, sub);
+define_fp_xN_bin_op(double, f64, 4, mul);
+define_fp_xN_bin_op(double, f64, 4, div);
+define_fp_xN_unary_op(double, f64, 4, sqrt);
+
+define_fp_xN_bin_op(float, f32, 8, add);
+define_fp_xN_bin_op(float, f32, 8, sub);
+define_fp_xN_bin_op(float, f32, 8, mul);
+define_fp_xN_bin_op(float, f32, 8, div);
+define_fp_xN_unary_op(float, f32, 8, sqrt);
+#endif
+
+/* 512-bits */
+#if HWY_MAX_BYTES >= 64
+define_fp_xN_bin_op(double, f64, 8, add);
+define_fp_xN_bin_op(double, f64, 8, sub);
+define_fp_xN_bin_op(double, f64, 8, mul);
+define_fp_xN_bin_op(double, f64, 8, div);
+define_fp_xN_unary_op(double, f64, 8, sqrt);
+
+define_fp_xN_bin_op(float, f32, 16, add);
+define_fp_xN_bin_op(float, f32, 16, sub);
+define_fp_xN_bin_op(float, f32, 16, mul);
+define_fp_xN_bin_op(float, f32, 16, div);
+define_fp_xN_unary_op(float, f32, 16, sqrt);
+#endif
+template <typename T>
+void _round(const T *HWY_RESTRICT a, T *HWY_RESTRICT result,
+            const size_t count) {
+  using D = hn::ScalableTag<T>;
+  const D d;
+  const size_t N = hn::Lanes(d);
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    auto a_vec = hn::Load(d, a + i);
+    auto res = round<D>(a_vec);
+    hn::Store(res, d, result + i);
+  }
+  //   using C = hn::CappedTag<T, 1>;
+  //   const C c;
+  //   for (; i < count; ++i) {
+  //     auto sigma_vec = hn::Load(d, sigma + i);
+  //     auto tau_vec = hn::Load(d, tau + i);
+  //     auto res = ud_round<C>(sigma_vec, tau_vec);
+  //     hn::Store(res, c, result + i);
+  //   }
+}
+
+template <typename T>
+HWY_NOINLINE void _add(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+                       T *HWY_RESTRICT result, const size_t count) {
+  using D = hn::ScalableTag<T>;
+  const D d;
+  const size_t N = hn::Lanes(d);
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    auto a_vec = hn::Load(d, a + i);
+    auto b_vec = hn::Load(d, b + i);
+    auto res = add<D>(a_vec, b_vec);
+    hn::Store(res, d, result + i);
+  }
+  //   using C = hn::CappedTag<T, 1>;
+  //   const C c;
+  //   for (; i < count; ++i) {
+  //     auto a_vec = hn::Load(c, a + i);
+  //     auto b_vec = hn::Load(c, b + i);
+  //     auto res = ud_add<C>(a_vec, b_vec);
+  //     hn::Store(res, c, result + i);
+  //   }
+}
+
+template <typename T>
+HWY_NOINLINE void _sub(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+                       T *HWY_RESTRICT result, const size_t count) {
+  using D = hn::ScalableTag<T>;
+  const D d;
+  const size_t N = hn::Lanes(d);
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    auto a_vec = hn::Load(d, a + i);
+    auto b_vec = hn::Load(d, b + i);
+    auto res = sub<D>(a_vec, b_vec);
+    hn::Store(res, d, result + i);
+  }
+  //   using C = hn::CappedTag<T, 1>;
+  //   const C c;
+  //   for (; i < count; ++i) {
+  //     auto a_vec = hn::Load(c, a + i);
+  //     auto b_vec = hn::Load(c, b + i);
+  //     auto res = ud_sub<C>(a_vec, b_vec);
+  //     hn::Store(res, c, result + i);
+  //   }
+}
+
+template <typename T>
+HWY_NOINLINE void _mul(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+                       T *HWY_RESTRICT result, const size_t count) {
+  using D = hn::ScalableTag<T>;
+  const hn::ScalableTag<T> d;
+  const size_t N = hn::Lanes(d);
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    auto a_vec = hn::Load(d, a + i);
+    auto b_vec = hn::Load(d, b + i);
+    auto res = mul<D>(a_vec, b_vec);
+    hn::Store(res, d, result + i);
+  }
+  //   using C = hn::CappedTag<T, 1>;
+  //   const C c;
+  //   for (; i < count; ++i) {
+  //     auto a_vec = hn::Load(c, a + i);
+  //     auto b_vec = hn::Load(c, b + i);
+  //     auto res = ud_mul<C>(a_vec, b_vec);
+  //     hn::Store(res, c, result + i);
+  //   }
+}
+
+template <typename T>
+HWY_NOINLINE void _div(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+                       T *HWY_RESTRICT result, const size_t count) {
+  using D = hn::ScalableTag<T>;
+  const hn::ScalableTag<T> d;
+  const size_t N = hn::Lanes(d);
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    auto a_vec = hn::Load(d, a + i);
+    auto b_vec = hn::Load(d, b + i);
+    auto res = div<D>(a_vec, b_vec);
+    hn::Store(res, d, result + i);
+  }
+  //   using C = hn::CappedTag<T, 1>;
+  //   const C c;
+  //   for (; i < count; ++i) {
+  //     auto a_vec = hn::Load(c, a + i);
+  //     auto b_vec = hn::Load(c, b + i);
+  //     auto res = ud_div<C>(a_vec, b_vec);
+  //     hn::Store(res, c, result + i);
+  //   }
+}
+
+template <typename T>
+HWY_NOINLINE void _sqrt(const T *HWY_RESTRICT a, T *HWY_RESTRICT result,
+                        const size_t count) {
+  using D = hn::ScalableTag<T>;
+  const hn::ScalableTag<T> d;
+  const size_t N = hn::Lanes(d);
+  size_t i = 0;
+  for (; i + N <= count; i += N) {
+    auto a_vec = hn::Load(d, a + i);
+    auto res = sqrt<D>(a_vec);
+    hn::Store(res, d, result + i);
+  }
+  //   using C = hn::CappedTag<T, 1>;
+  //   const C c;
+  //   for (; i < count; ++i) {
+  //     auto a_vec = hn::Load(d, a + i);
+  //     auto res = ud_sqrt<C>(a_vec);
+  //     hn::Store(res, c, result + i);
+  //   }
+}
+void _round_f32(const float *HWY_RESTRICT a, float *HWY_RESTRICT result,
+                const size_t count) {
+  _round<float>(a, result, count);
+}
+
+void _add_f32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result, const size_t count) {
+  _add<float>(a, b, result, count);
+}
+void _sub_f32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result, const size_t count) {
+  _sub<float>(a, b, result, count);
+}
+
+void _mul_f32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result, const size_t count) {
+  _mul<float>(a, b, result, count);
+}
+
+void _div_f32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result, const size_t count) {
+  _div<float>(a, b, result, count);
+}
+
+void _sqrt_f32(const float *HWY_RESTRICT a, float *HWY_RESTRICT result,
+               const size_t count) {
+  _sqrt<float>(a, result, count);
+}
+
+void _round_f64(const double *HWY_RESTRICT a, double *HWY_RESTRICT result,
+                const size_t count) {
+  _round<double>(a, result, count);
+}
+
+void _add_f64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result, const size_t count) {
+  _add<double>(a, b, result, count);
+}
+
+void _sub_f64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result, const size_t count) {
+  _sub<double>(a, b, result, count);
+}
+
+void _mul_f64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result, const size_t count) {
+  _mul<double>(a, b, result, count);
+}
+
+void _div_f64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result, const size_t count) {
+  _div<double>(a, b, result, count);
+}
+
+void _sqrt_f64(const double *HWY_RESTRICT a, double *HWY_RESTRICT result,
+               const size_t count) {
+  _sqrt<double>(a, result, count);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace vector
+} // namespace ud
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace ud {
+namespace vector {
+
+HWY_EXPORT(_round_f32);
+HWY_EXPORT(_round_f64);
+HWY_EXPORT(_add_f32);
+HWY_EXPORT(_add_f64);
+HWY_EXPORT(_sub_f32);
+HWY_EXPORT(_sub_f64);
+HWY_EXPORT(_mul_f32);
+HWY_EXPORT(_mul_f64);
+HWY_EXPORT(_div_f32);
+HWY_EXPORT(_div_f64);
+HWY_EXPORT(_sqrt_f32);
+HWY_EXPORT(_sqrt_f64);
+
+/* 32-bits */
+#if HWY_MAX_BYTES >= 4
+HWY_EXPORT(_addx1_f32);
+HWY_EXPORT(_subx1_f32);
+HWY_EXPORT(_mulx1_f32);
+HWY_EXPORT(_divx1_f32);
+HWY_EXPORT(_sqrtx1_f32);
+#endif
+
+/* 64-bits */
+#if HWY_MAX_BYTES >= 8
+HWY_EXPORT(_addx1_f64);
+HWY_EXPORT(_subx1_f64);
+HWY_EXPORT(_mulx1_f64);
+HWY_EXPORT(_divx1_f64);
+HWY_EXPORT(_sqrtx1_f64);
+HWY_EXPORT(_addx2_f32);
+HWY_EXPORT(_subx2_f32);
+HWY_EXPORT(_mulx2_f32);
+HWY_EXPORT(_divx2_f32);
+HWY_EXPORT(_sqrtx2_f32);
+#endif
+
+/* 128-bits */
+#if HWY_MAX_BYTES >= 16
+HWY_EXPORT(_addx2_f64);
+HWY_EXPORT(_subx2_f64);
+HWY_EXPORT(_mulx2_f64);
+HWY_EXPORT(_divx2_f64);
+HWY_EXPORT(_sqrtx2_f64);
+HWY_EXPORT(_addx4_f32);
+HWY_EXPORT(_subx4_f32);
+HWY_EXPORT(_mulx4_f32);
+HWY_EXPORT(_divx4_f32);
+HWY_EXPORT(_sqrtx4_f32);
+#endif
+
+/* 256-bits */
+#if HWY_MAX_BYTES >= 32
+HWY_EXPORT(_addx4_f64);
+HWY_EXPORT(_subx4_f64);
+HWY_EXPORT(_mulx4_f64);
+HWY_EXPORT(_divx4_f64);
+HWY_EXPORT(_sqrtx4_f64);
+HWY_EXPORT(_addx8_f32);
+HWY_EXPORT(_subx8_f32);
+HWY_EXPORT(_mulx8_f32);
+HWY_EXPORT(_divx8_f32);
+HWY_EXPORT(_sqrtx8_f32);
+#endif
+
+/* 512-bits */
+#if HWY_MAX_BYTES >= 64
+HWY_EXPORT(_addx8_f64);
+HWY_EXPORT(_subx8_f64);
+HWY_EXPORT(_mulx8_f64);
+HWY_EXPORT(_divx8_f64);
+HWY_EXPORT(_sqrtx8_f64);
+HWY_EXPORT(_addx16_f32);
+HWY_EXPORT(_subx16_f32);
+HWY_EXPORT(_mulx16_f32);
+HWY_EXPORT(_divx16_f32);
+HWY_EXPORT(_sqrtx16_f32);
+#endif
+
+/* 1024-bits */
+#if HWY_MAX_BYTES >= 128
+HWY_EXPORT(_addx16_f64);
+HWY_EXPORT(_subx16_f64);
+HWY_EXPORT(_mulx16_f64);
+HWY_EXPORT(_divx16_f64);
+HWY_EXPORT(_sqrtx16_f64);
+#endif
+
+template <typename T>
+void round(const T *HWY_RESTRICT a, T *HWY_RESTRICT result,
+           const size_t count) {
+  if constexpr (std::is_same_v<T, float>) {
+    return HWY_DYNAMIC_DISPATCH(_round_f32)(a, result, count);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return HWY_DYNAMIC_DISPATCH(_round_f64)(a, result, count);
+  }
+}
+
+template <typename T>
+void add(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+         T *HWY_RESTRICT result, const size_t count) {
+  if constexpr (std::is_same_v<T, float>) {
+    return HWY_DYNAMIC_DISPATCH(_add_f32)(a, b, result, count);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return HWY_DYNAMIC_DISPATCH(_add_f64)(a, b, result, count);
+  }
+}
+
+template <typename T>
+void sub(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+         T *HWY_RESTRICT result, const size_t count) {
+  if constexpr (std::is_same_v<T, float>) {
+    return HWY_DYNAMIC_DISPATCH(_sub_f32)(a, b, result, count);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return HWY_DYNAMIC_DISPATCH(_sub_f64)(a, b, result, count);
+  }
+}
+
+template <typename T>
+void mul(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+         T *HWY_RESTRICT result, const size_t count) {
+  if constexpr (std::is_same_v<T, float>) {
+    return HWY_DYNAMIC_DISPATCH(_mul_f32)(a, b, result, count);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return HWY_DYNAMIC_DISPATCH(_mul_f64)(a, b, result, count);
+  }
+}
+
+template <typename T>
+void div(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b,
+         T *HWY_RESTRICT result, const size_t count) {
+  if constexpr (std::is_same_v<T, float>) {
+    return HWY_DYNAMIC_DISPATCH(_div_f32)(a, b, result, count);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return HWY_DYNAMIC_DISPATCH(_div_f64)(a, b, result, count);
+  }
+}
+
+template <typename T>
+void sqrt(const T *HWY_RESTRICT a, T *HWY_RESTRICT result, const size_t count) {
+  if constexpr (std::is_same_v<T, float>) {
+    return HWY_DYNAMIC_DISPATCH(_sqrt_f32)(a, result, count);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return HWY_DYNAMIC_DISPATCH(_sqrt_f64)(a, result, count);
+  }
+}
+
+/* Array functions */
+
+void addf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+            float *HWY_RESTRICT result, const size_t count) {
+  return add(a, b, result, count);
+}
+
+void subf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+            float *HWY_RESTRICT result, const size_t count) {
+  return sub(a, b, result, count);
+}
+
+void mulf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+            float *HWY_RESTRICT result, const size_t count) {
+  return mul(a, b, result, count);
+}
+
+void divf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+            float *HWY_RESTRICT result, const size_t count) {
+  return div(a, b, result, count);
+}
+
+void sqrtf32(const float *HWY_RESTRICT a, float *HWY_RESTRICT result,
+             const size_t count) {
+  return sqrt(a, result, count);
+}
+
+void addf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+            double *HWY_RESTRICT result, const size_t count) {
+  return add(a, b, result, count);
+}
+
+void subf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+            double *HWY_RESTRICT result, const size_t count) {
+  return sub(a, b, result, count);
+}
+
+void mulf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+            double *HWY_RESTRICT result, const size_t count) {
+  return mul(a, b, result, count);
+}
+
+void divf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+            double *HWY_RESTRICT result, const size_t count) {
+  return div(a, b, result, count);
+}
+
+void sqrtf64(const double *HWY_RESTRICT a, double *HWY_RESTRICT result,
+             const size_t count) {
+  return sqrt(a, result, count);
+}
+
+/* Single vector instructions with dynamic dispatch */
+
+/* 64-bits functions */
+#if HWY_MAX_BYTES >= 8
+void addf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_addx2_f32)(a, b, result);
+}
+
+void subf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_subx2_f32)(a, b, result);
+}
+
+void mulf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_mulx2_f32)(a, b, result);
+}
+
+void divf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_divx2_f32)(a, b, result);
+}
+#endif
+
+/* 128-bits functions */
+#if HWY_MAX_BYTES >= 16
+void addf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_addx2_f64)(a, b, result);
+}
+
+void subf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_subx2_f64)(a, b, result);
+}
+
+void mulf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_mulx2_f64)(a, b, result);
+}
+
+void divf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_divx2_f64)(a, b, result);
+}
+
+void sqrtf64x2(const double *HWY_RESTRICT a, double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_sqrtx2_f64)(a, result);
+}
+
+void addf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_addx4_f32)(a, b, result);
+}
+
+void subf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_subx4_f32)(a, b, result);
+}
+
+void mulf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_mulx4_f32)(a, b, result);
+}
+
+void divf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_divx4_f32)(a, b, result);
+}
+#endif
+
+/* 256-bits functions*/
+
+#if HWY_MAX_BYTES >= 32
+void addf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_addx8_f32)(a, b, result);
+}
+
+void subf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_subx8_f32)(a, b, result);
+}
+
+void mulf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_mulx8_f32)(a, b, result);
+}
+
+void divf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_divx8_f32)(a, b, result);
+}
+
+void addf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_addx4_f64)(a, b, result);
+}
+
+void subf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_subx4_f64)(a, b, result);
+}
+
+void mulf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_mulx4_f64)(a, b, result);
+}
+
+void divf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_divx4_f64)(a, b, result);
+}
+#endif
+
+/* 512-bits functions */
+
+#if HWY_MAX_BYTES >= 64
+void addf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_addx8_f64)(a, b, result);
+}
+
+void subf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_subx8_f64)(a, b, result);
+}
+
+void mulf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_mulx8_f64)(a, b, result);
+}
+
+void divf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_divx8_f64)(a, b, result);
+}
+
+void addf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+               float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_addx16_f32)(a, b, result);
+}
+
+void subf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+               float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_subx16_f32)(a, b, result);
+}
+
+void mulf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+               float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_mulx16_f32)(a, b, result);
+}
+
+void divf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+               float *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_divx16_f32)(a, b, result);
+}
+#endif
+
+/* 1024-bits functions */
+#if HWY_MAX_BYTES >= 128
+void addf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+               double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_addx16_f64)(a, b, result);
+}
+
+void subf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+               double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_subx16_f64)(a, b, result);
+}
+
+void mulf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+               double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_mulx16_f64)(a, b, result);
+}
+
+void divf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+               double *HWY_RESTRICT result) {
+  HWY_DYNAMIC_DISPATCH(_divx16_f64)(a, b, result);
+}
+#endif
+
+/* Single vector functions with static dispatch */
+
+/* 64-bits */
+
+#if HWY_MAX_BYTES >= 8
+
+f32x2_v addf32x2_v(const f32x2_v a, const f32x2_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v));
+  HWY_STATIC_DISPATCH(_addx2_f32)(a_ptr, b_ptr, result_ptr);
+  f32x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x2_v));
+  return result;
+}
+
+f32x2_v subf32x2_v(const f32x2_v a, const f32x2_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v));
+  HWY_STATIC_DISPATCH(_subx2_f32)(a_ptr, b_ptr, result_ptr);
+  f32x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x2_v));
+  return result;
+}
+
+f32x2_v mulf32x2_v(const f32x2_v a, const f32x2_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v));
+  HWY_STATIC_DISPATCH(_mulx2_f32)(a_ptr, b_ptr, result_ptr);
+  f32x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x2_v));
+  return result;
+}
+
+f32x2_v divf32x2_v(const f32x2_v a, const f32x2_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v));
+  HWY_STATIC_DISPATCH(_divx2_f32)(a_ptr, b_ptr, result_ptr);
+  f32x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x2_v));
+  return result;
+}
+
+#endif
+
+/* 128-bits */
+
+#if HWY_MAX_BYTES >= 16
+
+f64x2_v addf64x2_v(const f64x2_v a, const f64x2_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v));
+  HWY_STATIC_DISPATCH(_addx2_f64)(a_ptr, b_ptr, result_ptr);
+  f64x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x2_v));
+  return result;
+}
+
+f64x2_v subf64x2_v(const f64x2_v a, const f64x2_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v));
+  HWY_STATIC_DISPATCH(_subx2_f64)(a_ptr, b_ptr, result_ptr);
+  f64x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x2_v));
+  return result;
+}
+
+f64x2_v mulf64x2_v(const f64x2_v a, const f64x2_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v));
+  HWY_STATIC_DISPATCH(_mulx2_f64)(a_ptr, b_ptr, result_ptr);
+  f64x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x2_v));
+  return result;
+}
+
+f64x2_v divf64x2_v(const f64x2_v a, const f64x2_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v));
+  HWY_STATIC_DISPATCH(_divx2_f64)(a_ptr, b_ptr, result_ptr);
+  f64x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x2_v));
+  return result;
+}
+
+f32x4_v addf32x4_v(const f32x4_v a, const f32x4_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v));
+  HWY_STATIC_DISPATCH(_addx4_f32)(a_ptr, b_ptr, result_ptr);
+  f32x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x4_v));
+  return result;
+}
+
+f32x4_v subf32x4_v(const f32x4_v a, const f32x4_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v));
+  HWY_STATIC_DISPATCH(_subx4_f32)(a_ptr, b_ptr, result_ptr);
+  f32x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x4_v));
+  return result;
+}
+
+f32x4_v mulf32x4_v(const f32x4_v a, const f32x4_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v));
+  HWY_STATIC_DISPATCH(_mulx4_f32)(a_ptr, b_ptr, result_ptr);
+  f32x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x4_v));
+  return result;
+}
+
+f32x4_v divf32x4_v(const f32x4_v a, const f32x4_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v));
+  HWY_STATIC_DISPATCH(_divx4_f32)(a_ptr, b_ptr, result_ptr);
+  f32x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x4_v));
+  return result;
+}
+
+#endif
+
+/* 256-bits */
+
+#if HWY_MAX_BYTES >= 32
+
+f64x4_v addf64x4_v(const f64x4_v a, const f64x4_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v));
+  HWY_STATIC_DISPATCH(_addx4_f64)(a_ptr, b_ptr, result_ptr);
+  f64x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x4_v));
+  return result;
+}
+
+f64x4_v subf64x4_v(const f64x4_v a, const f64x4_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v));
+  HWY_STATIC_DISPATCH(_subx4_f64)(a_ptr, b_ptr, result_ptr);
+  f64x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x4_v));
+  return result;
+}
+
+f64x4_v mulf64x4_v(const f64x4_v a, const f64x4_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v));
+  HWY_STATIC_DISPATCH(_mulx4_f64)(a_ptr, b_ptr, result_ptr);
+  f64x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x4_v));
+  return result;
+}
+
+f64x4_v divf64x4_v(const f64x4_v a, const f64x4_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v));
+  HWY_STATIC_DISPATCH(_divx4_f64)(a_ptr, b_ptr, result_ptr);
+  f64x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x4_v));
+  return result;
+}
+
+f32x8_v addf32x8_v(const f32x8_v a, const f32x8_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v));
+  HWY_STATIC_DISPATCH(_addx8_f32)(a_ptr, b_ptr, result_ptr);
+  f32x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x8_v));
+  return result;
+}
+
+f32x8_v subf32x8_v(const f32x8_v a, const f32x8_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v));
+  HWY_STATIC_DISPATCH(_subx8_f32)(a_ptr, b_ptr, result_ptr);
+  f32x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x8_v));
+  return result;
+}
+
+f32x8_v mulf32x8_v(const f32x8_v a, const f32x8_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v));
+  HWY_STATIC_DISPATCH(_mulx8_f32)(a_ptr, b_ptr, result_ptr);
+  f32x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x8_v));
+  return result;
+}
+
+f32x8_v divf32x8_v(const f32x8_v a, const f32x8_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v));
+  HWY_STATIC_DISPATCH(_divx8_f32)(a_ptr, b_ptr, result_ptr);
+  f32x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x8_v));
+  return result;
+}
+
+#endif
+
+/* 512-bits */
+#if HWY_MAX_BYTES >= 64
+f64x8_v addf64x8_v(const f64x8_v a, const f64x8_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v));
+  HWY_STATIC_DISPATCH(_addx8_f64)(a_ptr, b_ptr, result_ptr);
+  f64x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x8_v));
+  return result;
+}
+
+f64x8_v subf64x8_v(const f64x8_v a, const f64x8_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v));
+  HWY_STATIC_DISPATCH(_subx8_f64)(a_ptr, b_ptr, result_ptr);
+  f64x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x8_v));
+  return result;
+}
+
+f64x8_v mulf64x8_v(const f64x8_v a, const f64x8_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v));
+  HWY_STATIC_DISPATCH(_mulx8_f64)(a_ptr, b_ptr, result_ptr);
+  f64x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x8_v));
+  return result;
+}
+
+f64x8_v divf64x8_v(const f64x8_v a, const f64x8_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v));
+  HWY_STATIC_DISPATCH(_divx8_f64)(a_ptr, b_ptr, result_ptr);
+  f64x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x8_v));
+  return result;
+}
+
+f32x16_v addf32x16_v(const f32x16_v a, const f32x16_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v));
+  HWY_STATIC_DISPATCH(_addx16_f32)(a_ptr, b_ptr, result_ptr);
+  f32x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x16_v));
+  return result;
+}
+
+f32x16_v subf32x16_v(const f32x16_v a, const f32x16_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v));
+  HWY_STATIC_DISPATCH(_subx16_f32)(a_ptr, b_ptr, result_ptr);
+  f32x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x16_v));
+  return result;
+}
+
+f32x16_v mulf32x16_v(const f32x16_v a, const f32x16_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v));
+  HWY_STATIC_DISPATCH(_mulx16_f32)(a_ptr, b_ptr, result_ptr);
+  f32x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x16_v));
+  return result;
+}
+
+f32x16_v divf32x16_v(const f32x16_v a, const f32x16_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v));
+  HWY_STATIC_DISPATCH(_divx16_f32)(a_ptr, b_ptr, result_ptr);
+  f32x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x16_v));
+  return result;
+}
+
+#endif // 512-bits
+
+/* 1024-bits */
+#if HWY_MAX_BYTES >= 128
+f64x16_v addf64x16_v(const f64x16_v a, const f64x16_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v));
+  HWY_STATIC_DISPATCH(_addx16_f64)(a_ptr, b_ptr, result_ptr);
+  f64x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x16_v));
+  return result;
+}
+
+f64x16_v subf64x16_v(const f64x16_v a, const f64x16_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v));
+  HWY_STATIC_DISPATCH(_subx16_f64)(a_ptr, b_ptr, result_ptr);
+  f64x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x16_v));
+  return result;
+}
+
+f64x16_v mulf64x16_v(const f64x16_v a, const f64x16_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v));
+  HWY_STATIC_DISPATCH(_mulx16_f64)(a_ptr, b_ptr, result_ptr);
+  f64x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x16_v));
+  return result;
+}
+
+f64x16_v divf64x16_v(const f64x16_v a, const f64x16_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v));
+  HWY_STATIC_DISPATCH(_divx16_f64)(a_ptr, b_ptr, result_ptr);
+  f64x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x16_v));
+  return result;
+}
+#endif // 1024-bits
+
+/* Single vector functions, dynamic dispatch */
+
+/* IEEE-754 binary32 x2 */
+
+f32x2_v addf32x2_d(const f32x2_v a, const f32x2_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v));
+  addf32(a_ptr, b_ptr, result_ptr, 2);
+  f32x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x2_v));
+  return result;
+}
+
+f32x2_v subf32x2_d(const f32x2_v a, const f32x2_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v));
+  subf32(a_ptr, b_ptr, result_ptr, 2);
+  f32x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x2_v));
+  return result;
+}
+
+f32x2_v mulf32x2_d(const f32x2_v a, const f32x2_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v));
+  mulf32(a_ptr, b_ptr, result_ptr, 2);
+  f32x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x2_v));
+  return result;
+}
+
+f32x2_v divf32x2_d(const f32x2_v a, const f32x2_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v));
+  divf32(a_ptr, b_ptr, result_ptr, 2);
+  f32x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x2_v));
+  return result;
+}
+
+/* IEEE-754 binary64 x2 */
+
+f64x2_v addf64x2_d(const f64x2_v a, const f64x2_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v));
+  addf64(a_ptr, b_ptr, result_ptr, 2);
+  f64x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x2_v));
+  return result;
+}
+
+f64x2_v subf64x2_d(const f64x2_v a, const f64x2_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v));
+  subf64(a_ptr, b_ptr, result_ptr, 2);
+  f64x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x2_v));
+  return result;
+}
+
+f64x2_v mulf64x2_d(const f64x2_v a, const f64x2_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v));
+  mulf64(a_ptr, b_ptr, result_ptr, 2);
+  f64x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x2_v));
+  return result;
+}
+
+f64x2_v divf64x2_d(const f64x2_v a, const f64x2_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v));
+  divf64(a_ptr, b_ptr, result_ptr, 2);
+  f64x2_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x2_v));
+  return result;
+}
+
+/* IEEE-754 binary32 x4 */
+f32x4_v addf32x4_d(const f32x4_v a, const f32x4_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v));
+  addf32(a_ptr, b_ptr, result_ptr, 4);
+  f32x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x4_v));
+  return result;
+}
+
+f32x4_v subf32x4_d(const f32x4_v a, const f32x4_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v));
+  subf32(a_ptr, b_ptr, result_ptr, 4);
+  f32x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x4_v));
+  return result;
+}
+
+f32x4_v mulf32x4_d(const f32x4_v a, const f32x4_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v));
+  mulf32(a_ptr, b_ptr, result_ptr, 4);
+  f32x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x4_v));
+  return result;
+}
+
+f32x4_v divf32x4_d(const f32x4_v a, const f32x4_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v));
+  divf32(a_ptr, b_ptr, result_ptr, 4);
+  f32x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x4_v));
+  return result;
+}
+
+/* IEEE-754 binary64 x4 */
+
+f64x4_v addf64x4_d(const f64x4_v a, const f64x4_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v));
+  addf64(a_ptr, b_ptr, result_ptr, 4);
+  f64x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x4_v));
+  return result;
+}
+
+f64x4_v subf64x4_d(const f64x4_v a, const f64x4_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v));
+  subf64(a_ptr, b_ptr, result_ptr, 4);
+  f64x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x4_v));
+  return result;
+}
+
+f64x4_v mulf64x4_d(const f64x4_v a, const f64x4_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v));
+  mulf64(a_ptr, b_ptr, result_ptr, 4);
+  f64x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x4_v));
+  return result;
+}
+
+f64x4_v divf64x4_d(const f64x4_v a, const f64x4_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v));
+  divf64(a_ptr, b_ptr, result_ptr, 4);
+  f64x4_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x4_v));
+  return result;
+}
+
+/* IEEE-754 binary32 x8 */
+
+f32x8_v addf32x8_d(const f32x8_v a, const f32x8_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v));
+  addf32(a_ptr, b_ptr, result_ptr, 8);
+  f32x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x8_v));
+  return result;
+}
+f32x8_v subf32x8_d(const f32x8_v a, const f32x8_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v));
+  subf32(a_ptr, b_ptr, result_ptr, 8);
+  f32x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x8_v));
+  return result;
+}
+
+f32x8_v mulf32x8_d(const f32x8_v a, const f32x8_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v));
+  mulf32(a_ptr, b_ptr, result_ptr, 8);
+  f32x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x8_v));
+  return result;
+}
+
+f32x8_v divf32x8_d(const f32x8_v a, const f32x8_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v));
+  divf32(a_ptr, b_ptr, result_ptr, 8);
+  f32x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x8_v));
+  return result;
+}
+
+/* IEEE-754 binary64 x8 */
+
+f64x8_v addf64x8_d(const f64x8_v a, const f64x8_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v));
+  addf64(a_ptr, b_ptr, result_ptr, 8);
+  f64x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x8_v));
+  return result;
+}
+
+f64x8_v subf64x8_d(const f64x8_v a, const f64x8_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v));
+  subf64(a_ptr, b_ptr, result_ptr, 8);
+  f64x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x8_v));
+  return result;
+}
+
+f64x8_v mulf64x8_d(const f64x8_v a, const f64x8_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v));
+  mulf64(a_ptr, b_ptr, result_ptr, 8);
+  f64x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x8_v));
+  return result;
+}
+
+f64x8_v divf64x8_d(const f64x8_v a, const f64x8_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v));
+  divf64(a_ptr, b_ptr, result_ptr, 8);
+  f64x8_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x8_v));
+  return result;
+}
+
+/* IEEE-754 binary32 x16 */
+
+f32x16_v addf32x16_d(const f32x16_v a, const f32x16_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v));
+  addf32(a_ptr, b_ptr, result_ptr, 16);
+  f32x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x16_v));
+  return result;
+}
+
+f32x16_v subf32x16_d(const f32x16_v a, const f32x16_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v));
+  subf32(a_ptr, b_ptr, result_ptr, 16);
+  f32x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x16_v));
+  return result;
+}
+
+f32x16_v mulf32x16_d(const f32x16_v a, const f32x16_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v));
+  mulf32(a_ptr, b_ptr, result_ptr, 16);
+  f32x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x16_v));
+  return result;
+}
+
+f32x16_v divf32x16_d(const f32x16_v a, const f32x16_v b) {
+  const float *a_ptr = reinterpret_cast<const float *>(&a);
+  const float *b_ptr = reinterpret_cast<const float *>(&b);
+  float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v));
+  divf32(a_ptr, b_ptr, result_ptr, 16);
+  f32x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f32x16_v));
+  return result;
+}
+
+/* IEEE-754 binary64 x16 */
+
+f64x16_v addf64x16_d(const f64x16_v a, const f64x16_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v));
+  addf64(a_ptr, b_ptr, result_ptr, 16);
+  f64x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x16_v));
+  return result;
+}
+
+f64x16_v subf64x16_d(const f64x16_v a, const f64x16_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v));
+  subf64(a_ptr, b_ptr, result_ptr, 16);
+  f64x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x16_v));
+  return result;
+}
+
+f64x16_v mulf64x16_d(const f64x16_v a, const f64x16_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v));
+  mulf64(a_ptr, b_ptr, result_ptr, 16);
+  f64x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x16_v));
+  return result;
+}
+
+f64x16_v divf64x16_d(const f64x16_v a, const f64x16_v b) {
+  const double *a_ptr = reinterpret_cast<const double *>(&a);
+  const double *b_ptr = reinterpret_cast<const double *>(&b);
+  double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v));
+  divf64(a_ptr, b_ptr, result_ptr, 16);
+  f64x16_v result;
+  std::memcpy(&result, result_ptr, sizeof(f64x16_v));
+  return result;
+}
+
+} // namespace vector
+} // namespace ud
+
+#endif // HWY_ONCE
+
+#if HWY_ONCE
+namespace ud {
+namespace scalar {
+
+float addf32(float a, float b) { return ud::scalar::add<float>(a, b); }
+float subf32(float a, float b) { return ud::scalar::sub<float>(a, b); }
+float mulf32(float a, float b) { return ud::scalar::mul<float>(a, b); }
+float divf32(float a, float b) { return ud::scalar::div<float>(a, b); }
+float sqrtf32(float a) { return ud::scalar::sqrt<float>(a); }
+
+double addf64(double a, double b) { return ud::scalar::add<double>(a, b); }
+double subf64(double a, double b) { return ud::scalar::sub<double>(a, b); }
+double mulf64(double a, double b) { return ud::scalar::mul<double>(a, b); }
+double divf64(double a, double b) { return ud::scalar::div<double>(a, b); }
+double sqrtf64(double a) { return ud::scalar::sqrt<double>(a); }
+
+} // namespace scalar
+} // namespace ud
+#endif // HWY_ONCE
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/src/ud_hw.h b/src/libvfcinstrumentonline/rand/src/ud_hw.h
new file mode 100644
index 00000000..37f119da
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/src/ud_hw.h
@@ -0,0 +1,258 @@
+#ifndef _INTERFLOP_UD_LIB_H_
+#define _INTERFLOP_UD_LIB_H_
+
+#include "hwy/highway.h"
+
+namespace ud {
+namespace vector {
+/* Array functions */
+
+/* IEEE-754 binary32 */
+
+void addf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+            float *HWY_RESTRICT result, const size_t count);
+
+void subf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+            float *HWY_RESTRICT result, const size_t count);
+
+void mulf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+            float *HWY_RESTRICT result, const size_t count);
+
+void divf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+            float *HWY_RESTRICT result, const size_t count);
+
+void sqrtf32(const float *HWY_RESTRICT a, float *HWY_RESTRICT result,
+             const size_t count);
+
+/* IEEE-754 binary64 */
+
+void addf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+            double *HWY_RESTRICT result, const size_t count);
+
+void subf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+            double *HWY_RESTRICT result, const size_t count);
+
+void mulf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+            double *HWY_RESTRICT result, const size_t count);
+
+void divf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+            double *HWY_RESTRICT result, const size_t count);
+
+void sqrtf64(const double *HWY_RESTRICT a, double *HWY_RESTRICT result,
+             const size_t count);
+
+/* Single vector functions, pointer arguments */
+
+/* 64-bits */
+#if HWY_MAX_BYTES >= 8
+/* IEEE-754 binary32 x2 */
+void addf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void subf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void mulf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void divf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+#endif
+
+/* 128-bits */
+#if HWY_MAX_BYTES >= 16
+/* IEEE-754 binary32 x4 */
+void addf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void subf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void mulf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void divf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+
+/* IEEE-754 binary64 x2 */
+void addf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void subf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void mulf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void divf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+#endif
+
+/* 256-bits */
+#if HWY_MAX_BYTES >= 32
+/* IEEE-754 binary64 x4 */
+void addf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void subf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void mulf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void divf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+
+/* IEEE-754 binary32 x8 */
+void addf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void subf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void mulf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+void divf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+              float *HWY_RESTRICT result);
+#endif
+
+/* 512-bits */
+#if HWY_MAX_BYTES >= 64
+/* IEEE-754 binary64 x8 */
+void addf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void subf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void mulf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+void divf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+              double *HWY_RESTRICT result);
+
+/* IEEE-754 binary32 x16 */
+void addf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+               float *HWY_RESTRICT result);
+void subf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+               float *HWY_RESTRICT result);
+void mulf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+               float *HWY_RESTRICT result);
+void divf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b,
+               float *HWY_RESTRICT result);
+#endif
+
+/* 1024-bits */
+#if HWY_MAX_BYTES >= 128
+/* IEEE-754 binary64 x16 */
+void addf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+               double *HWY_RESTRICT result);
+void subf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+               double *HWY_RESTRICT result);
+void mulf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+               double *HWY_RESTRICT result);
+void divf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b,
+               double *HWY_RESTRICT result);
+#endif
+
+/* Single vector functions, vector argument */
+
+/* 64-bits */
+typedef float f32x2_v __attribute__((vector_size(8)));
+#if HWY_MAX_BYTES >= 8
+f32x2_v addf32x2_v(const f32x2_v a, const f32x2_v b);
+f32x2_v subf32x2_v(const f32x2_v a, const f32x2_v b);
+f32x2_v mulf32x2_v(const f32x2_v a, const f32x2_v b);
+f32x2_v divf32x2_v(const f32x2_v a, const f32x2_v b);
+#endif
+
+/* 128-bits */
+typedef double f64x2_v __attribute__((vector_size(16)));
+typedef float f32x4_v __attribute__((vector_size(16)));
+#if HWY_MAX_BYTES >= 16
+f64x2_v addf64x2_v(const f64x2_v a, const f64x2_v b);
+f64x2_v subf64x2_v(const f64x2_v a, const f64x2_v b);
+f64x2_v mulf64x2_v(const f64x2_v a, const f64x2_v b);
+f64x2_v divf64x2_v(const f64x2_v a, const f64x2_v b);
+
+f32x4_v addf32x4_v(const f32x4_v a, const f32x4_v b);
+f32x4_v subf32x4_v(const f32x4_v a, const f32x4_v b);
+f32x4_v mulf32x4_v(const f32x4_v a, const f32x4_v b);
+f32x4_v divf32x4_v(const f32x4_v a, const f32x4_v b);
+#endif
+
+/* 256-bits */
+typedef double f64x4_v __attribute__((vector_size(32)));
+typedef float f32x8_v __attribute__((vector_size(32)));
+#if HWY_MAX_BYTES >= 32
+f64x4_v addf64x4_v(const f64x4_v a, const f64x4_v b);
+f64x4_v subf64x4_v(const f64x4_v a, const f64x4_v b);
+f64x4_v mulf64x4_v(const f64x4_v a, const f64x4_v b);
+f64x4_v divf64x4_v(const f64x4_v a, const f64x4_v b);
+
+f32x8_v addf32x8_v(const f32x8_v a, const f32x8_v b);
+f32x8_v subf32x8_v(const f32x8_v a, const f32x8_v b);
+f32x8_v mulf32x8_v(const f32x8_v a, const f32x8_v b);
+f32x8_v divf32x8_v(const f32x8_v a, const f32x8_v b);
+#endif
+
+/* 512-bits */
+typedef double f64x8_v __attribute__((vector_size(64)));
+typedef float f32x16_v __attribute__((vector_size(64)));
+#if HWY_MAX_BYTES >= 64
+f64x8_v addf64x8_v(const f64x8_v a, const f64x8_v b);
+f64x8_v subf64x8_v(const f64x8_v a, const f64x8_v b);
+f64x8_v mulf64x8_v(const f64x8_v a, const f64x8_v b);
+f64x8_v divf64x8_v(const f64x8_v a, const f64x8_v b);
+
+f32x16_v addf32x16_v(const f32x16_v a, const f32x16_v b);
+f32x16_v subf32x16_v(const f32x16_v a, const f32x16_v b);
+f32x16_v mulf32x16_v(const f32x16_v a, const f32x16_v b);
+f32x16_v divf32x16_v(const f32x16_v a, const f32x16_v b);
+#endif
+
+typedef double f64x16_v __attribute__((vector_size(128)));
+#if HWY_MAX_BYTES >= 128
+f64x16_v addf64x16_v(const f64x16_v a, const f64x16_v b);
+f64x16_v subf64x16_v(const f64x16_v a, const f64x16_v b);
+f64x16_v mulf64x16_v(const f64x16_v a, const f64x16_v b);
+f64x16_v divf64x16_v(const f64x16_v a, const f64x16_v b);
+#endif
+
+/* Single vectors functions, vector arguments, dynamic dispatch */
+
+/* IEEE-754 binary32 x2 */
+f32x2_v addf32x2_d(const f32x2_v a, const f32x2_v b);
+f32x2_v subf32x2_d(const f32x2_v a, const f32x2_v b);
+f32x2_v mulf32x2_d(const f32x2_v a, const f32x2_v b);
+f32x2_v divf32x2_d(const f32x2_v a, const f32x2_v b);
+
+/* IEEE-754 binary64 x2 */
+f64x2_v addf64x2_d(const f64x2_v a, const f64x2_v b);
+f64x2_v subf64x2_d(const f64x2_v a, const f64x2_v b);
+f64x2_v mulf64x2_d(const f64x2_v a, const f64x2_v b);
+f64x2_v divf64x2_d(const f64x2_v a, const f64x2_v b);
+
+/* IEEE-754 binary32 x4 */
+f32x4_v addf32x4_d(const f32x4_v a, const f32x4_v b);
+f32x4_v subf32x4_d(const f32x4_v a, const f32x4_v b);
+f32x4_v mulf32x4_d(const f32x4_v a, const f32x4_v b);
+f32x4_v divf32x4_d(const f32x4_v a, const f32x4_v b);
+
+/* IEEE-754 binary64 x4 */
+f64x4_v addf64x4_d(const f64x4_v a, const f64x4_v b);
+f64x4_v subf64x4_d(const f64x4_v a, const f64x4_v b);
+f64x4_v mulf64x4_d(const f64x4_v a, const f64x4_v b);
+f64x4_v divf64x4_d(const f64x4_v a, const f64x4_v b);
+
+/* IEEE-754 binary32 x8 */
+f32x8_v addf32x8_d(const f32x8_v a, const f32x8_v b);
+f32x8_v subf32x8_d(const f32x8_v a, const f32x8_v b);
+f32x8_v mulf32x8_d(const f32x8_v a, const f32x8_v b);
+f32x8_v divf32x8_d(const f32x8_v a, const f32x8_v b);
+
+/* IEEE-754 binary64 x8 */
+f64x8_v addf64x8_d(const f64x8_v a, const f64x8_v b);
+f64x8_v subf64x8_d(const f64x8_v a, const f64x8_v b);
+f64x8_v mulf64x8_d(const f64x8_v a, const f64x8_v b);
+f64x8_v divf64x8_d(const f64x8_v a, const f64x8_v b);
+
+/* IEEE-754 binary32 x16 */
+f32x16_v addf32x16_d(const f32x16_v a, const f32x16_v b);
+f32x16_v subf32x16_d(const f32x16_v a, const f32x16_v b);
+f32x16_v mulf32x16_d(const f32x16_v a, const f32x16_v b);
+f32x16_v divf32x16_d(const f32x16_v a, const f32x16_v b);
+
+/* IEEE-754 binary64 x16 */
+f64x16_v addf64x16_d(const f64x16_v a, const f64x16_v b);
+f64x16_v subf64x16_d(const f64x16_v a, const f64x16_v b);
+f64x16_v mulf64x16_d(const f64x16_v a, const f64x16_v b);
+f64x16_v divf64x16_d(const f64x16_v a, const f64x16_v b);
+
+} // namespace vector
+} // namespace ud
+
+#endif // _INTERFLOP_UD_LIB_H_
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/src/utils.hpp b/src/libvfcinstrumentonline/rand/src/utils.hpp
index a1c4ee02..a0373364 100644
--- a/src/libvfcinstrumentonline/rand/src/utils.hpp
+++ b/src/libvfcinstrumentonline/rand/src/utils.hpp
@@ -70,7 +70,7 @@ template <> struct IEEE754<double> {
 // Uses the FP constant 𝜓 = 1 − 2^{−𝑝} .
 // 𝑧 ← RN𝑒 (𝜓𝑎) (= pred(|a|))
 // 𝛿 ← RN𝑒 (𝑎 − 𝑧)
-// return 𝛿
+// return (z,𝛿)
 template <typename T> T get_predecessor_abs(T a) {
   T phi = std::is_same<T, float>::value ? 1.0f - 0x1.0p-24f : 1.0 - 0x1.0p-53;
   T z = a * phi;
diff --git a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw-inl.hpp b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw-inl.hpp
index 3faf715e..973f5e4f 100644
--- a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw-inl.hpp
+++ b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw-inl.hpp
@@ -51,6 +51,7 @@ extern RNGInitializer rng;
 
 HWY_API float uniformf32() { return rng.get()->Uniform(); }
 HWY_API double uniformf64() { return rng.get()->Uniform(); }
+HWY_API std::uint64_t randomu64() { return rng.get()->operator()(); }
 
 } // namespace HWY_NAMESPACE
 } // namespace xoroshiro256plus
@@ -81,6 +82,7 @@ class RNGInitializer {
     return std::make_unique<hn::VectorXoshiro>(seed, thread_id);
   }
 };
+
 extern RNGInitializer rng;
 
 template <class D, class V = hn::VFromD<D>> V uniform(const D d) {
@@ -104,6 +106,27 @@ template <class D, class V = hn::VFromD<D>> V uniform(const D d) {
   }
 }
 
+template <class D, class V = hn::VFromD<D>> V random(const D d) {
+  sv::debug_msg("[random] START");
+  using T = hn::TFromD<D>;
+  const std::size_t lanes = hn::Lanes(d);
+  if constexpr (sizeof(lanes * sizeof(T)) <= 16) {
+    /* allocate at least 128bits */
+    /* since VectorXoshiro returns 128bits elements */
+    const std::size_t size_rng = std::max(lanes, std::size_t{8});
+    auto z = rng.get()->operator()(T{}, size_rng);
+    auto z_load = hn::Load(d, z.data());
+    sv::debug_msg("[random] END");
+    return z_load;
+  } else {
+    const std::size_t size_rng = lanes;
+    auto z = rng.get()->operator()(T{}, size_rng);
+    auto z_load = hn::Load(d, z.data());
+    sv::debug_msg("[random] END");
+    return z_load;
+  }
+}
+
 } // namespace HWY_NAMESPACE
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.cpp b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.cpp
index bc350fa4..4c1c91b3 100644
--- a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.cpp
+++ b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.cpp
@@ -82,10 +82,12 @@ namespace xoroshiro256plus {
 
 HWY_EXPORT(uniformf32);
 HWY_EXPORT(uniformf64);
+HWY_EXPORT(randomu64);
 
 namespace static_dispatch {
 float uniform(float) { return HWY_STATIC_DISPATCH(uniformf32)(); }
 double uniform(double) { return HWY_STATIC_DISPATCH(uniformf64)(); }
+std::uint64_t random() { return HWY_STATIC_DISPATCH(randomu64)(); }
 
 } // namespace static_dispatch
 
@@ -98,6 +100,10 @@ HWY_DLLEXPORT double uniform(double) {
   return HWY_DYNAMIC_DISPATCH(uniformf64)();
 }
 
+HWY_DLLEXPORT std::uint64_t random() {
+  return HWY_DYNAMIC_DISPATCH(randomu64)();
+}
+
 } // namespace dynamic_dispatch
 
 } // namespace xoroshiro256plus
diff --git a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.h b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.h
index 48b75a9b..396e3c4d 100644
--- a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.h
+++ b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.h
@@ -44,11 +44,13 @@ namespace xoroshiro256plus {
 namespace static_dispatch {
 HWY_DLLEXPORT float uniform(float);
 HWY_DLLEXPORT double uniform(double);
+HWY_DLLEXPORT std::uint64_t random();
 } // namespace static_dispatch
 
 namespace dynamic_dispatch {
 HWY_DLLEXPORT float uniform(float);
 HWY_DLLEXPORT double uniform(double);
+HWY_DLLEXPORT std::uint64_t random();
 } // namespace dynamic_dispatch
 
 } // namespace xoroshiro256plus
diff --git a/src/libvfcinstrumentonline/rand/tests/BUILD b/src/libvfcinstrumentonline/rand/tests/BUILD
index 46c429bb..dbf52b5d 100644
--- a/src/libvfcinstrumentonline/rand/tests/BUILD
+++ b/src/libvfcinstrumentonline/rand/tests/BUILD
@@ -2,148 +2,152 @@ load("//tests:macros.bzl", "cc_test_gen", "cc_test_lib_gen")
 
 HWY_OPTS = [
     "-O2",
-    "-std=c++20",
-    "-Wfatal-errors",
-    # "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)",
-    # "-DHWY_IS_TEST",
-]
-
-DEPS = [
-    "@hwy//:math",
 ]
 
 cc_test_gen(
-    name = "get_exponent_test",
+    name = "test_get_exponent",
     copts = HWY_OPTS,
 )
 
 cc_test_gen(
-    name = "get_predabs_test",
+    name = "test_get_predabs",
     copts = HWY_OPTS,
 )
 
 cc_test_gen(
-    name = "get_pow2_test",
+    name = "test_get_pow2",
     copts = HWY_OPTS,
 )
 
 cc_test_gen(
-    name = "get_pow2_test-hw",
-    copts = HWY_OPTS + ["-DSR_DEBUG"],
+    name = "test_get_pow2_hwy",
+    copts = HWY_OPTS,
+    dbg = True,
 )
 
 cc_test_gen(
-    name = "twosum_test",
+    name = "test_twosum",
     copts = HWY_OPTS,
 )
 
 cc_test_gen(
-    name = "twoprodfma_test",
+    name = "test_twoprodfma",
     copts = HWY_OPTS,
 )
 
+# Scalar tests
+
 cc_test_lib_gen(
-    name = "udround_test",
+    name = "sr-scalar-accuracy",
+    size = "small",
+    src = ["//tests:test_sr_accuracy.cpp"],
+    copts = HWY_OPTS,
 )
 
 cc_test_lib_gen(
-    name = "sr_accuracy",
+    name = "sr-scalar-accuracy-dbg",
     size = "small",
-    src = ["//tests:srround_test.cpp"],
+    src = ["//tests:test_sr_accuracy.cpp"],
     copts = HWY_OPTS,
+    dbg = True,
 )
 
 cc_test_lib_gen(
-    name = "sr_accuracy-dbg",
+    name = "ud-scalar-accuracy",
     size = "small",
-    src = ["//tests:srround_test.cpp"],
-    copts = HWY_OPTS + ["-DSR_DEBUG"],
+    src = [
+        "//tests:test_ud_accuracy.cpp",
+        "//src:srcs-ud",
+    ],
+    copts = HWY_OPTS,
 )
 
-# single test for scalar operations
 cc_binary(
-    name = "single-test",
-    srcs = ["//tests:test_sr_hw.cpp"],
-    copts = HWY_OPTS + [
-        "-DSR_DEBUG",
-    ],
-    deps = [
-        "@//src:sr",
-        "@hwy",
-    ],
+    name = "sr-scalar-single-test",
+    srcs = ["//tests:test_single_sr.cpp"],
+    copts = HWY_OPTS + ["-DSR_DEBUG"],
+    deps = ["@//src:sr"],
 )
 
-# single test for vector operations
 cc_binary(
-    name = "single-test-hw",
-    srcs = ["//tests:test_sr_hw-API.cpp"],
-    copts = HWY_OPTS + [
-        "-DSR_DEBUG",
-    ],
-    deps = [
-        "@//src:sr",
-        "@googletest//:gtest",
-        "@googletest//:gtest_main",
-        "@hwy",
-        "@hwy//:hwy_test_util",
-    ],
+    name = "ud-scalar-single-test",
+    srcs = ["//tests:test_single_ud.cpp"],
+    copts = HWY_OPTS + ["-DSR_DEBUG"],
+    deps = ["@//src:ud"],
 )
 
+# Vector tests
+
 cc_test_lib_gen(
     name = "xoroshiro",
     size = "medium",
-    src = ["//tests:xoroshiro.cpp"],
-    copts = [
-        "-O2",
-        "-Wfatal-errors",
-        "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)",
-        "-DHWY_IS_TEST",
-        "-DSR_DEBUG",
+    src = [
+        "//src:xoroshiro",
+        "//tests:test_xoroshiro.cpp",
     ],
+    copts = HWY_OPTS,
+    dbg = True,
+)
+
+cc_binary(
+    name = "sr-vector-single-test",
+    srcs = ["//tests:test_single_sr_hwy.cpp"],
+    copts = HWY_OPTS + ["-DSR_DEBUG"],
 )
 
 cc_test_lib_gen(
-    name = "sr_hw_accuracy",
+    name = "sr-vector-accuracy",
     size = "medium",
-    src = ["//tests:srround_test_hw-inl.cpp"],
+    src = ["//tests:test_sr_hwy_accuracy.cpp"],
     copts = HWY_OPTS,
 )
 
 cc_test_lib_gen(
-    name = "sr_hw_accuracy-dbg",
+    name = "ud-vector-accuracy",
     size = "medium",
-    src = ["//tests:srround_test_hw-inl.cpp"],
-    copts = HWY_OPTS + ["-DSR_DEBUG"],
+    src = [
+        "//tests:test_ud_hwy_accuracy.cpp",
+        "//src:srcs-ud",
+    ],
+    copts = HWY_OPTS,
 )
 
 cc_test_lib_gen(
-    name = "sr_hw_test",
+    name = "sr-vector-accuracy-dbg",
     size = "medium",
-    src = ["//tests:sr_hw_test.cpp"],
-    copts = [
-        "-std=c++20",
-        "-O2",
-        "-Wfatal-errors",
-        "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)",
-        "-DHWY_IS_TEST",
-        "-DSR_DEBUG",
-        "-Wno-psabi",
-    ],
+    src = ["//tests:test_sr_hwy_accuracy.cpp"],
+    copts = HWY_OPTS,
+    dbg = True,
+)
+
+cc_test_lib_gen(
+    name = "sr-perf",
+    size = "medium",
+    src = ["//tests:test_sr_hwy_performance.cpp"],
+    copts = HWY_OPTS,
     deps = [
         "@//src:sr",
         "@hwy//:nanobenchmark",
     ],
 )
 
+cc_test_lib_gen(
+    name = "ud-perf",
+    size = "medium",
+    src = ["//tests:test_ud_hwy_performance.cpp"],
+    copts = HWY_OPTS,
+    deps = [
+        "@//src:ud",
+        "@hwy//:nanobenchmark",
+    ],
+)
+
 test_suite(
     name = "all",
     tests = [
         ":get_exponent_test",
         ":get_pow2_test",
         ":get_predabs_test",
-        ":srround_boost_test",
-        ":srround_shishua_test",
-        ":srround_xoroshiro_test",
         ":twoprodfma_test",
         ":twosum_test",
         ":udround_test",
diff --git a/src/libvfcinstrumentonline/rand/tests/macros.bzl b/src/libvfcinstrumentonline/rand/tests/macros.bzl
index e5792a15..803a2a9d 100644
--- a/src/libvfcinstrumentonline/rand/tests/macros.bzl
+++ b/src/libvfcinstrumentonline/rand/tests/macros.bzl
@@ -1,12 +1,14 @@
+"""
+"""
+
 COPTS = [
-    "-std=c++20",
+    "-std=c++17",
     "-I.",
     "-Wfatal-errors",
+    "-DHWY_IS_TEST",
 ]
 
-SRCS = [
-    "//src:srcs",
-]
+SRCS = []
 
 DEPS = [
     "@hwy",
@@ -19,22 +21,22 @@ DEPS = [
 
 HEADERS = ["//tests:helper.hpp"]
 
-def cc_test_gen(name, src = None, deps = DEPS, copts = COPTS, size = "small"):
+def cc_test_gen(name, src = None, deps = DEPS, copts = COPTS, size = "small", dbg = False):
     native.cc_test(
         name = name,
         srcs = [src if src else name + ".cpp"] + HEADERS + SRCS,
-        copts = COPTS + (copts if copts else []),
+        copts = COPTS + (copts if copts else []) + (["-DSR_DEBUG"] if dbg else []),
         deps = deps,
         size = size,
     )
 
-def cc_test_lib_gen(name, src = None, deps = None, copts = COPTS, size = "small"):
+def cc_test_lib_gen(name, src = None, deps = None, copts = COPTS, size = "small", dbg = False):
     srcs = src if src else [name + ".cpp"]
     srcs += HEADERS
     native.cc_test(
         name = name,
         srcs = srcs + SRCS,
-        copts = COPTS + (copts if copts else []),
+        copts = COPTS + (copts if copts else []) + (["-DSR_DEBUG"] if dbg else []),
         deps = DEPS + (deps if deps else []),
         size = size,
         visibility = ["//visibility:public"],
diff --git a/src/libvfcinstrumentonline/rand/tests/test.c b/src/libvfcinstrumentonline/rand/tests/test.c
deleted file mode 100644
index dc7faa30..00000000
--- a/src/libvfcinstrumentonline/rand/tests/test.c
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <stdint.h>
-#include <stdio.h>
-
-int main() {
-  float a = 1.4999999;
-  double b = 1.4999999;
-  double c = 0.1;
-  double d = __builtin_inff64();
-
-  printf("a: %.13a\n", a);
-  printf("b: %.13a\n", b);
-  printf("c: %.13a\n", c);
-  printf("d: %.13a\n", (float)c);
-  printf("e: %.13a\n", d - d);
-
-  return 0;
-}
diff --git a/src/libvfcinstrumentonline/rand/tests/get_exponent_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_get_exponent.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/get_exponent_test.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_get_exponent.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/get_pow2_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_get_pow2.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/get_pow2_test.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_get_pow2.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/get_pow2_test-hw.cpp b/src/libvfcinstrumentonline/rand/tests/test_get_pow2_hwy.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/get_pow2_test-hw.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_get_pow2_hwy.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/get_predabs_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_get_predabs.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/get_predabs_test.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_get_predabs.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/test_sr_hw.cpp b/src/libvfcinstrumentonline/rand/tests/test_single_sr.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/test_sr_hw.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_single_sr.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/test_sr_hw-API.cpp b/src/libvfcinstrumentonline/rand/tests/test_single_sr_hwy.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/test_sr_hw-API.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_single_sr_hwy.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/test_single_ud.cpp b/src/libvfcinstrumentonline/rand/tests/test_single_ud.cpp
new file mode 100644
index 00000000..8b84c989
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/tests/test_single_ud.cpp
@@ -0,0 +1,70 @@
+#include <iostream>
+#include <map>
+
+#include "hwy/highway.h"
+#include "hwy/print-inl.h"
+
+#include "src/ud.h"
+
+template <typename T> void test_bin(T a, T b, int op) {
+
+  constexpr const char *fptype = std::is_same_v<T, float> ? "f32" : "f64";
+  constexpr const char *opname[] = {"Add", "Sub", "Mul", "Div"};
+  constexpr const char *fmt = std::is_same_v<T, float> ? "%+.6a " : "%+.13a ";
+
+  const std::string opstr = opname[op];
+  std::cerr << opstr << fptype << ":\n";
+
+  switch (op) {
+  case 0:
+    fprintf(stderr, fmt, ud::scalar::add(a, b));
+    break;
+  case 1:
+    fprintf(stderr, fmt, ud::scalar::sub(a, b));
+    break;
+  case 2:
+    fprintf(stderr, fmt, ud::scalar::mul(a, b));
+    break;
+  case 3:
+    fprintf(stderr, fmt, ud::scalar::div(a, b));
+    break;
+  }
+
+  std::cerr << std::endl;
+}
+
+int main(int argc, char *argv[]) {
+
+  if (argc != 5) {
+    std::cerr << "Usage: " << argv[0] << " <float> <float> <op> <type>\n";
+    return 1;
+  }
+
+  // print HWY_TARGET
+  std::cout << "HWY_TARGET: " << HWY_TARGET << std::endl;
+
+  double a = std::stof(argv[1]);
+  double b = std::stof(argv[2]);
+  const std::string op = std::string(argv[3]);
+  const std::string type = std::string(argv[4]);
+
+  const auto op_map = std::map<std::string, int>{
+      {"add", 0},
+      {"sub", 1},
+      {"mul", 2},
+      {"div", 3},
+  };
+
+  const auto type_map = std::map<std::string, int>{
+      {"f32", 0},
+      {"f64", 1},
+  };
+
+  if (type == "f32") {
+    test_bin<float>(a, b, op_map.at(op));
+  } else if (type == "f64") {
+    test_bin<double>(a, b, op_map.at(op));
+  }
+
+  return 0;
+}
diff --git a/src/libvfcinstrumentonline/rand/tests/srround_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_sr_accuracy.cpp
similarity index 98%
rename from src/libvfcinstrumentonline/rand/tests/srround_test.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_sr_accuracy.cpp
index bba87884..0fb457ea 100644
--- a/src/libvfcinstrumentonline/rand/tests/srround_test.cpp
+++ b/src/libvfcinstrumentonline/rand/tests/test_sr_accuracy.cpp
@@ -242,7 +242,9 @@ void check_distribution_match(T a, T b,
   // do the test only if the operation is not exact (probability is not zero)
   bool is_exact = probability_down == 1 and probability_up == 1;
 
-  // do
+  // do the test only if the operation is not exact (probability is not zero)
+  // do the test if the distance between the reference and the estimated value
+  // is greater than the minimum subnormal
   // do not the test if the probability is lower than 1/repetitions
   bool compare_down_values = not is_exact and down != 0 and
                              probability_down > (1.0 / repetitions) and
@@ -251,8 +253,7 @@ void check_distribution_match(T a, T b,
                            probability_up > (1.0 / repetitions) and
                            distance_error > helper::IEEE754<T>::min_subnormal;
 
-  if (not is_exact and down != 0 and probability_down > (1.0 / repetitions) and
-      distance_error > helper::IEEE754<T>::min_subnormal)
+  if (compare_down_values)
     EXPECT_THAT(counter.down(), Eq(static_cast<T>(down)))
         << "Value ↓ is not equal to reference\n"
         << "            type: " << ftype << "\n"
@@ -277,8 +278,7 @@ void check_distribution_match(T a, T b,
         << distance_error_msg << std::defaultfloat << flush();
 
   // do not the test if the probability is lower than 1/repetitions
-  if (not is_exact and up != 0 and probability_up > (1.0 / repetitions) and
-      distance_error > helper::IEEE754<T>::min_subnormal)
+  if (compare_up_values)
     EXPECT_THAT(counter.up(), Eq(static_cast<T>(up)))
         << "Value ↑ is not equal to reference\n"
         << "            type: " << ftype << "\n"
diff --git a/src/libvfcinstrumentonline/rand/tests/test_sr_hwy_accuracy.cpp b/src/libvfcinstrumentonline/rand/tests/test_sr_hwy_accuracy.cpp
new file mode 100644
index 00000000..e2019885
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/tests/test_sr_hwy_accuracy.cpp
@@ -0,0 +1,802 @@
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <iostream>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/srround_test_hw-inl.cpp"
+#include "hwy/foreach_target.h"
+
+#include "hwy/highway.h"
+#include "hwy/base.h"
+#include "hwy/tests/test_util-inl.h"
+#include "src/debug_hwy-inl.h"
+#include "src/sr_hw-inl.h"
+// clang-format on
+
+#include "src/sr_hw.h"
+#include "src/utils.hpp"
+#include "tests/helper.hpp"
+
+HWY_BEFORE_NAMESPACE(); // at file scope
+
+namespace sr {
+
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+namespace {
+
+using ::testing::AllOf;
+using ::testing::Ge;
+using ::testing::Le;
+using ::testing::Lt;
+
+constexpr auto default_alpha = 0.001;
+#ifdef SR_DEBUG
+constexpr auto default_repetitions = 100;
+#else
+constexpr auto default_repetitions = 10'000;
+#endif
+
+static auto distribution_failed_tests_counter = 0;
+static auto distribution_tests_counter = 0;
+
+namespace reference {
+// return pred(|s|)
+
+// twosum reference
+// compute in double precision if the input type is float
+// compute in quad precision if the input type is double
+template <typename T, typename H = typename helper::IEEE754<T>::H>
+H add(T a, T b) {
+  return static_cast<H>(a) + static_cast<H>(b);
+}
+
+template <typename T, typename H = typename helper::IEEE754<T>::H>
+H sub(T a, T b) {
+  return static_cast<H>(a) - static_cast<H>(b);
+}
+
+template <typename T, typename H = typename helper::IEEE754<T>::H>
+H mul(T a, T b) {
+  return static_cast<H>(a) * static_cast<H>(b);
+}
+
+template <typename T, typename H = typename helper::IEEE754<T>::H>
+H div(T a, T b) {
+  return static_cast<H>(a) / static_cast<H>(b);
+}
+
+}; // namespace reference
+
+namespace srvh = sr::vector::HWY_NAMESPACE;
+
+struct SRAdd {
+  static constexpr char name[] = "add";
+  static constexpr char symbol[] = "+";
+
+  template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+  V operator()(D d, V a, V b) {
+    return srvh::sr_add<D>(a, b);
+  }
+
+  template <typename T, typename H = typename helper::IEEE754<T>::H>
+  static H reference(T a, T b) {
+    return reference::add<T>(a, b);
+  }
+};
+
+struct SRSub {
+  static constexpr char name[] = "sub";
+  static constexpr char symbol[] = "-";
+
+  template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+  V operator()(D d, V a, V b) {
+    return srvh::sr_sub<D>(a, b);
+  }
+
+  template <typename T, typename H = typename helper::IEEE754<T>::H>
+  static H reference(T a, T b) {
+    return reference::sub<T>(a, b);
+  }
+};
+
+struct SRMul {
+  static constexpr char name[] = "mul";
+  static constexpr char symbol[] = "*";
+
+  template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+  V operator()(D d, V a, V b) {
+    return srvh::sr_mul<D>(a, b);
+  }
+
+  template <typename T, typename H = typename helper::IEEE754<T>::H>
+  static H reference(T a, T b) {
+    return reference::mul<T>(a, b);
+  }
+};
+
+struct SRDiv {
+  static constexpr char name[] = "div";
+  static constexpr char symbol[] = "/";
+
+  template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+  V operator()(D d, V a, V b) {
+    return srvh::sr_div<D>(a, b);
+  }
+
+  template <typename T, typename H = typename helper::IEEE754<T>::H>
+  static H reference(T a, T b) {
+    return reference::div<T>(a, b);
+  }
+};
+
+template <typename T, typename H = typename helper::IEEE754<T>::H>
+std::tuple<H, H, H, H, H, std::string> compute_distance_error(T a, T b,
+                                                              H reference) {
+  T ref_cast = static_cast<T>(reference);
+
+  if (helper::isnan(a) or helper::isnan(b) or helper::isnan(reference) or
+      helper::isinf(a) or helper::isinf(b) or helper::isinf(reference) or
+      helper::isinf(ref_cast)) {
+    return {0, 0, reference, reference, 0, ""};
+  }
+
+  H error = 0, error_c = 0;
+  H probability_down = 0, probability_up = 0;
+  H next = 0, prev = 0;
+  H ulp = helper::get_ulp(ref_cast);
+
+  if (ref_cast == reference) {
+    error = 0;
+    probability_down = 1;
+    probability_up = 1;
+    prev = reference;
+    next = reference;
+  } else {
+    error = helper::absolute_distance(reference, static_cast<H>(ref_cast));
+    error_c = helper::absolute_distance(static_cast<H>(ulp), error);
+    prev = (ref_cast < reference) ? ref_cast : (ref_cast - ulp);
+    next = (ref_cast < reference) ? (ref_cast + ulp) : ref_cast;
+    probability_down = (next - reference) / ulp;
+    probability_up = (reference - prev) / ulp;
+
+    if (((error + error_c) != ulp)) {
+      std::cerr << "error + error_c != ulp" << "\n"
+                << "error:   " << helper::hexfloat(error) << "\n"
+                << "error_c: " << helper::hexfloat(error_c) << "\n"
+                << "prev:    " << helper::hexfloat(prev) << "\n"
+                << "next:    " << helper::hexfloat(next) << "\n"
+                << "ulp:     " << helper::hexfloat(ulp) << std::endl;
+      HWY_ASSERT(false);
+    }
+    if (probability_down + probability_up != 1) {
+      std::cerr << "probability_down + probability_up != 1" << "\n"
+                << "probability_down: " << probability_down << "\n"
+                << "probability_up:   " << probability_up << "\n"
+                << "reference:        " << helper::hexfloat(reference) << "\n"
+                << "prev:             " << helper::hexfloat(prev) << "\n"
+                << "next:             " << helper::hexfloat(next) << "\n"
+                << "error:            " << helper::hexfloat(error) << "\n"
+                << "error_c:          " << helper::hexfloat(error_c) << "\n"
+                << "ulp:              " << helper::hexfloat(ulp) << std::endl;
+      HWY_ASSERT(false);
+    }
+  }
+
+  std::ostringstream os;
+  os << std::hexfloat << std::setprecision(13);
+  os << "-- compute_distance_error --" << std::endl;
+  os << "         reference: " << helper::hexfloat(reference) << std::endl;
+  if constexpr (std::is_same_v<T, float>) {
+    os << "  (float)reference: " << helper::hexfloat(ref_cast) << std::endl;
+    os << "  |ref-(float)ref|: " << helper::hexfloat(error) << std::endl;
+  }
+  if constexpr (std::is_same_v<T, double>) {
+    os << " (double)reference: " << helper::hexfloat(ref_cast) << std::endl;
+    os << " |ref-(double)ref|: " << helper::hexfloat(error) << std::endl;
+  }
+  os << "           error_c: " << helper::hexfloat(error_c) << std::endl;
+  os << "               ulp: " << helper::hexfloat(ulp) << std::endl;
+  os << "       reference ↓: " << helper::hexfloat(prev) << std::endl;
+  os << "       reference ↑: " << helper::hexfloat(next) << std::endl;
+  os << std::defaultfloat;
+  os << "                 p: " << probability_down << std::endl;
+  os << "               1-p: " << probability_up << std::endl;
+  auto msg = os.str();
+
+  return {probability_down, probability_up, prev, next, error, msg};
+}
+
+template <class Op, class D, class V = hn::VFromD<D>,
+          typename T = hn::TFromD<D>>
+std::vector<helper::Counter<T>> eval_op(D d, V a, V b, const int repetitions) {
+
+  const size_t lanes = hn::Lanes(d);
+  auto op = Op();
+
+  std::vector<helper::Counter<T>> c(lanes);
+  for (int i = 0; i < repetitions; i++) {
+    auto v = op(d, a, b);
+    for (size_t j = 0; j < lanes; j++) {
+      auto vj = hn::ExtractLane(v, j);
+      c[j][vj]++;
+    }
+  }
+
+  return c;
+}
+
+template <typename T> std::string fmt_proba(const T &x) {
+  std::ostringstream os;
+  os << std::fixed << std::setprecision(5) << x << std::defaultfloat;
+  return os.str();
+}
+
+std::string flush() {
+  debug_flush();
+  return "";
+}
+
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
+void assert_equal_inputs(D d, V a) {
+  auto a_min = hn::ReduceMin(d, a);
+  auto a_max = hn::ReduceMax(d, a);
+  if (helper::isnan(a_min) and helper::isnan(a_max))
+    return;
+
+#if HWY_TARGET != HWY_EMU128
+  if (a_min != a_max) {
+    std::cerr << "Vector does not have equal inputs!\n"
+              << "min(a): " << helper::hexfloat(a_min) << "\n"
+              << "max(a): " << helper::hexfloat(a_max) << std::endl;
+    HWY_ASSERT(false);
+  }
+#endif
+}
+
+template <class Op, class D, class V = hn::VFromD<D>,
+          typename T = hn::TFromD<D>>
+void check_distribution_match(D d, V va, V vb,
+                              const long repetitions = default_repetitions,
+                              const float alpha = default_alpha) {
+  using H = typename helper::IEEE754<T>::H;
+
+  auto a = hn::GetLane(va);
+  auto b = hn::GetLane(vb);
+
+  // ensure that we have vector of same value
+  assert_equal_inputs(d, va);
+  assert_equal_inputs(d, vb);
+
+  auto counters = eval_op<Op>(d, va, vb, repetitions);
+
+  H reference = Op::reference(a, b);
+  auto [probability_down, probability_up, down, up, distance_error,
+        distance_error_msg] = compute_distance_error(a, b, reference);
+
+  size_t lane = 0;
+  for (auto &counter : counters) {
+
+    auto count_down = counter.down_count();
+    auto count_up = counter.up_count();
+    auto probability_down_estimated = count_down / (double)repetitions;
+    auto probability_up_estimated = count_up / (double)repetitions;
+
+    if (not helper::isfinite(counter.up()) or
+        not helper::isfinite(counter.down())) {
+      if (helper::isnan(counter.up()) and helper::isnan(reference))
+        return;
+      if (helper::isnan(counter.down()) and helper::isnan(reference))
+        return;
+      if (helper::isinf(counter.up()) and helper::isinf(reference)) {
+        if (counter.up() != reference) {
+          std::cerr << "Error in inf comparison\n"
+                    << "counter.up() != reference\n"
+                    << "counter.up(): " << counter.up() << "\n"
+                    << "reference: " << reference << "\n"
+                    << distance_error_msg << std::defaultfloat << flush();
+          HWY_ASSERT(false);
+        }
+      }
+      if (helper::isinf(counter.down()) and helper::isinf(reference)) {
+        if (counter.down() != reference) {
+          std::cerr << "Error in inf comparison\n"
+                    << "counter.down() != reference\n"
+                    << "counter.down(): " << counter.down() << "\n"
+                    << "reference: " << reference << "\n"
+                    << distance_error_msg << std::defaultfloat << flush();
+          HWY_ASSERT(false);
+        }
+      }
+      return;
+    }
+
+    /* code */
+    if (0.0 > probability_down or probability_down > 1.0) {
+      std::cerr << "Probability ↓ is not in [0, 1] range\n"
+                << "-- theoretical -\n"
+                << "   probability ↓: " << fmt_proba(probability_down) << "\n"
+                << "   probability ↑: " << fmt_proba(probability_up) << "\n"
+                << "--- estimated --\n"
+                << "     sample size: " << repetitions << "\n"
+                << "              #↓: " << count_down << " ("
+                << fmt_proba(probability_down_estimated) << ")\n"
+                << "              #↑: " << count_up << " ("
+                << fmt_proba(probability_up_estimated) << ")\n"
+                << std::hexfloat << "" << "              ↓: " << counter.down()
+                << "\n"
+                << "              ↑: " << counter.up() << "\n"
+                << distance_error_msg << std::defaultfloat << flush();
+      HWY_ASSERT(false);
+    }
+
+    // do the test only if the operation is not exact (probability is not zero)
+    bool is_exact = probability_down == 1 and probability_up == 1;
+
+    // do the test only if the operation is not exact (probability is not zero)
+    // do the test if the distance between the reference and the estimated value
+    // is greater than the minimum subnormal
+    // do not the test if the probability is lower than 1/repetitions
+    bool compare_down_values =
+        not is_exact and down != 0 and
+        probability_down > (1.0 / repetitions) and
+        distance_error > helper::IEEE754<T>::min_subnormal;
+    bool compare_up_values = not is_exact and up != 0 and
+                             probability_up > (1.0 / repetitions) and
+                             distance_error > helper::IEEE754<T>::min_subnormal;
+
+    // 95% relative error
+    auto error_down = (probability_down_estimated / probability_down);
+    auto error_up = (probability_up_estimated / probability_up);
+
+    // binomial test
+    auto test = helper::binomial_test(repetitions, count_down,
+                                      static_cast<double>(probability_down));
+
+    const auto op_name = Op::name;
+    const auto op_symbol = Op::symbol;
+    const auto ftype = typeid(T).name();
+
+    // Bonferroni correction, divide by the number of lanes
+    const auto lanes = hn::Lanes(d);
+    const auto alpha_bon = (alpha / 2) / lanes;
+
+    if (compare_down_values and static_cast<T>(down) != counter.down()) {
+      std::cerr << "Value ↓ is not equal to reference\n"
+                << "     Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n"
+                << "            type: " << ftype << "\n"
+                << "              op: " << op_name << "\n"
+                << "           alpha: " << alpha_bon << "\n"
+                << std::hexfloat << std::setprecision(13) << ""
+                << "               a: " << helper::hexfloat(a) << "\n"
+                << "               b: " << helper::hexfloat(b) << "\n"
+                << "             a" << op_symbol
+                << "b: " << helper::hexfloat(reference) << "\n"
+                << std::defaultfloat << "" << "-- theoretical -\n"
+                << "   probability ↓: " << fmt_proba(probability_down) << "\n"
+                << "   probability ↑: " << fmt_proba(probability_up) << "\n"
+                << "--- estimated --\n"
+                << "     sample size: " << repetitions << "\n"
+                << "              #↓: " << count_down << " ("
+                << fmt_proba(probability_down_estimated) << ")\n"
+                << "              #↑: " << count_up << " ("
+                << fmt_proba(probability_up_estimated) << ")\n"
+                << std::hexfloat
+                << "              ↓: " << helper::hexfloat(counter.down())
+                << "\n"
+                << "              ↑: " << helper::hexfloat(counter.up()) << "\n"
+                << distance_error_msg << std::defaultfloat << flush();
+      HWY_ASSERT(0);
+    }
+
+    if (compare_up_values and static_cast<T>(up) != counter.up()) {
+      std::cerr << "Value ↑ is not equal to reference\n"
+                << "     Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n"
+                << "            type: " << ftype << "\n"
+                << "              op: " << op_name << "\n"
+                << "           alpha: " << alpha_bon << "\n"
+                << std::hexfloat << std::setprecision(13) << ""
+                << "               a: " << helper::hexfloat(a) << "\n"
+                << "               b: " << helper::hexfloat(b) << "\n"
+                << "             a" << op_symbol
+                << "b: " << helper::hexfloat(reference) << "\n"
+                << std::defaultfloat << "" << "-- theoretical -\n"
+                << "   probability ↓: " << fmt_proba(probability_down) << "\n"
+                << "   probability ↑: " << fmt_proba(probability_up) << "\n"
+                << "--- estimated --\n"
+                << "     sample size: " << repetitions << "\n"
+                << "              #↓: " << count_down << " ("
+                << fmt_proba(probability_down_estimated) << ")\n"
+                << "              #↑: " << count_up << " ("
+                << fmt_proba(probability_up_estimated) << ")\n"
+                << std::hexfloat
+                << "              ↓: " << helper::hexfloat(counter.down())
+                << "\n"
+                << "              ↑: " << helper::hexfloat(counter.up()) << "\n"
+                << distance_error_msg << std::defaultfloat << flush();
+      HWY_ASSERT(0);
+    }
+
+    // if (test.pvalue == 0)
+    //   return;
+
+    if (compare_down_values and compare_up_values and test.pvalue < alpha_bon) {
+      std::cerr << "Null hypotheis rejected!\n"
+                << "     Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n"
+                << "            type: " << ftype << "\n"
+                << "              op: " << op_name << "\n"
+                << "           alpha: " << alpha_bon << "\n"
+                << std::hexfloat << std::setprecision(13) << ""
+                << "               a: " << helper::hexfloat(a) << "\n"
+                << "               b: " << helper::hexfloat(b) << "\n"
+                << "             a" << op_symbol
+                << "b: " << helper::hexfloat(reference) << "\n"
+                << std::defaultfloat << "" << "-- theoretical -\n"
+                << "   probability ↓: " << fmt_proba(probability_down) << "\n"
+                << "   probability ↑: " << fmt_proba(probability_up) << "\n"
+                << "--- estimated --\n"
+                << "     sample size: " << repetitions << "\n"
+                << "              #↓: " << count_down << " ("
+                << fmt_proba(probability_down_estimated) << ")\n"
+                << "              #↑: " << count_up << " ("
+                << fmt_proba(probability_up_estimated) << ")\n"
+                << "         p-value: " << test.pvalue << "\n"
+                << "            ↓ %: " << error_down * 100 << "\n"
+                << "            ↑ %: " << error_up * 100 << "\n"
+                << std::hexfloat
+                << "              ↓: " << helper::hexfloat(counter.down())
+                << "\n"
+                << "              ↑: " << helper::hexfloat(counter.up()) << "\n"
+                << distance_error_msg << std::defaultfloat << flush();
+      HWY_ASSERT(0);
+    }
+    distribution_tests_counter++;
+    lane++;
+    debug_reset();
+  }
+}
+
+template <typename T> std::vector<T> get_simple_case() {
+  std::vector<T> simple_case = {0.0,
+                                1.0,
+                                2.0,
+                                3.0,
+                                std::numeric_limits<T>::min(),
+                                std::numeric_limits<T>::lowest(),
+                                std::numeric_limits<T>::max(),
+                                std::numeric_limits<T>::epsilon(),
+                                std::numeric_limits<T>::infinity(),
+                                std::numeric_limits<T>::denorm_min(),
+                                std::numeric_limits<T>::quiet_NaN(),
+                                std::numeric_limits<T>::signaling_NaN()};
+  return simple_case;
+}
+
+template <class D, typename T = hn::TFromD<D>> void do_run_test_exact_add(D d) {
+  constexpr int32_t mantissa = helper::IEEE754<T>::mantissa;
+
+  for (int i = 0; i <= 5; i++) {
+    auto va = hn::Set(d, 1.25);
+    auto vb = hn::Set(d, std::ldexp(1.0, -(mantissa + i)));
+    check_distribution_match<SRAdd>(d, va, vb);
+  }
+}
+
+template <class Op, class D, class V = hn::VFromD<D>,
+          typename T = hn::TFromD<D>>
+void do_run_test_simple_case(D d) {
+  auto simple_case = get_simple_case<T>();
+  for (auto a : simple_case) {
+    for (auto b : simple_case) {
+      auto va = hn::Set(d, a);
+      auto vb = hn::Set(d, b);
+      auto va_neg = hn::Neg(va);
+      auto vb_neg = hn::Neg(vb);
+      check_distribution_match<Op>(d, va, vb);
+      check_distribution_match<Op>(d, va, vb_neg);
+      check_distribution_match<Op>(d, va_neg, vb);
+      check_distribution_match<Op>(d, va_neg, vb_neg);
+    }
+  }
+}
+
+template <class Op, class D, class V = hn::VFromD<D>,
+          typename T = hn::TFromD<D>>
+void do_run_test_random(D d, const double start_range_1st = 0.0,
+                        const double end_range_1st = 1.0,
+                        const double start_range_2nd = 0.0,
+                        const double end_range_2nd = 1.0) {
+  helper::RNG rng1(start_range_1st, end_range_1st);
+  helper::RNG rng2(start_range_2nd, end_range_2nd);
+
+  for (int i = 0; i < 100; i++) {
+    T a = rng1();
+    T b = rng2();
+    auto va = hn::Set(d, a);
+    auto vb = hn::Set(d, b);
+    auto va_neg = hn::Neg(va);
+    auto vb_neg = hn::Neg(vb);
+    check_distribution_match<Op>(d, va, vb);
+    check_distribution_match<Op>(d, va, vb_neg);
+    check_distribution_match<Op>(d, va_neg, vb);
+    check_distribution_match<Op>(d, va_neg, vb_neg);
+  }
+}
+
+void check_failed_tests(float threshold, bool skip = false) {
+  const auto failed_ratio =
+      distribution_failed_tests_counter / (double)distribution_tests_counter;
+  if (failed_ratio > threshold) {
+    std::cerr << "Number of failed tests above threshold!\n"
+              << "Threshold:       " << threshold << "\n"
+              << "Failed tests:    " << distribution_failed_tests_counter
+              << " (" << failed_ratio << ")\n"
+              << "Number of tests: " << distribution_tests_counter << "\n"
+              << std::endl;
+    HWY_ASSERT(skip);
+  }
+  distribution_failed_tests_counter = 0;
+  distribution_tests_counter = 0;
+}
+
+void assert_exact() { check_failed_tests(0); }
+void assert_almost_exact(bool skip = false) { check_failed_tests(0.05, skip); }
+
+struct TestExactOperationsAdd {
+  template <typename T, class D> void operator()(T /*unused*/, D d) {
+    do_run_test_exact_add(d);
+    assert_exact();
+  }
+};
+
+template <class Op> struct TestBasicAssertions {
+  template <typename T, class D> void operator()(T /*unused*/, D d) {
+    do_run_test_simple_case<Op>(d);
+    assert_almost_exact();
+  }
+};
+
+using TestBasicAssertionsAdd = TestBasicAssertions<SRAdd>;
+using TestBasicAssertionsSub = TestBasicAssertions<SRSub>;
+using TestBasicAssertionsMul = TestBasicAssertions<SRMul>;
+using TestBasicAssertionsDiv = TestBasicAssertions<SRDiv>;
+
+HWY_NOINLINE void TestAllExactOperationsAdd() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestExactOperationsAdd>());
+}
+
+HWY_NOINLINE void TestAllBasicAssertionsAdd() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestBasicAssertionsAdd>());
+}
+
+HWY_NOINLINE void TestAllBasicAssertionsSub() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestBasicAssertionsSub>());
+}
+
+HWY_NOINLINE void TestAllBasicAssertionsMul() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestBasicAssertionsMul>());
+}
+
+HWY_NOINLINE void TestAllBasicAssertionsDiv() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestBasicAssertionsDiv>());
+}
+
+template <class Op> struct TestRandom01Assertions {
+  template <typename T, class D> void operator()(T /*unused*/, D d) {
+    do_run_test_random<Op>(d);
+  }
+};
+
+using TestRandom01AssertionsAdd = TestRandom01Assertions<SRAdd>;
+using TestRandom01AssertionsSub = TestRandom01Assertions<SRSub>;
+using TestRandom01AssertionsMul = TestRandom01Assertions<SRMul>;
+using TestRandom01AssertionsDiv = TestRandom01Assertions<SRDiv>;
+
+HWY_NOINLINE void TestAllRandom01AssertionsAdd() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandom01AssertionsAdd>());
+}
+
+HWY_NOINLINE void TestAllRandom01AssertionsSub() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandom01AssertionsSub>());
+}
+
+HWY_NOINLINE void TestAllRandom01AssertionsMul() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandom01AssertionsMul>());
+}
+
+HWY_NOINLINE void TestAllRandom01AssertionsDiv() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandom01AssertionsDiv>());
+}
+
+template <class Op, class D, class V = hn::VFromD<D>,
+          typename T = hn::TFromD<D>>
+void do_random_no_overlap_test(D d) {
+  double start_range_1st = 0.0;
+  double end_range_1st = 1.0;
+  int s2;
+  if constexpr (std::is_same_v<T, float>) {
+    s2 = -25;
+  } else {
+    s2 = -54;
+  };
+  double start_range_2nd = std::ldexp(1.0, s2 + 1);
+  double end_range_2nd = std::ldexp(1.0, s2 + 1);
+  do_run_test_random<Op>(d, start_range_1st, end_range_1st, start_range_2nd,
+                         end_range_2nd);
+}
+
+template <class Op> struct TestRandomNoOverlapAssertions {
+  template <typename T, class D> void operator()(T /*unused*/, D d) {
+    do_random_no_overlap_test<Op>(d);
+  }
+};
+
+using TestRandomNoOverlapAssertionsAdd = TestRandomNoOverlapAssertions<SRAdd>;
+using TestRandomNoOverlapAssertionsSub = TestRandomNoOverlapAssertions<SRSub>;
+using TestRandomNoOverlapAssertionsMul = TestRandomNoOverlapAssertions<SRMul>;
+using TestRandomNoOverlapAssertionsDiv = TestRandomNoOverlapAssertions<SRDiv>;
+
+HWY_NOINLINE void TestAllRandomNoOverlapAssertionsAdd() {
+  hn::ForFloat3264Types(
+      hn::ForPartialVectors<TestRandomNoOverlapAssertionsAdd>());
+}
+
+HWY_NOINLINE void TestAllRandomNoOverlapAssertionsSub() {
+  hn::ForFloat3264Types(
+      hn::ForPartialVectors<TestRandomNoOverlapAssertionsSub>());
+}
+
+HWY_NOINLINE void TestAllRandomNoOverlapAssertionsMul() {
+  hn::ForFloat3264Types(
+      hn::ForPartialVectors<TestRandomNoOverlapAssertionsMul>());
+}
+
+HWY_NOINLINE void TestAllRandomNoOverlapAssertionsDiv() {
+  hn::ForFloat3264Types(
+      hn::ForPartialVectors<TestRandomNoOverlapAssertionsDiv>());
+}
+
+template <class Op, class D, class V = hn::VFromD<D>,
+          typename T = hn::TFromD<D>>
+void do_random_last_bit_overlap(D d) {
+  double start_range_1st = 1.0;
+  double end_range_1st = 2.0;
+  int s2;
+  if constexpr (std::is_same_v<T, float>) {
+    s2 = -24;
+  } else {
+    s2 = -53;
+  }
+  double start_range_2nd = std::ldexp(1.0, s2 + 1);
+  double end_range_2nd = std::ldexp(1.0, s2 + 1);
+  do_run_test_random<Op>(d, start_range_1st, end_range_1st, start_range_2nd,
+                         end_range_2nd);
+}
+
+template <class Op> struct TestRandomLastBitOverlap {
+  template <typename T, class D> void operator()(T /*unused*/, D d) {
+    do_random_last_bit_overlap<Op>(d);
+  }
+};
+
+using TestRandomLastBitOverlapAdd = TestRandomLastBitOverlap<SRAdd>;
+using TestRandomLastBitOverlapSub = TestRandomLastBitOverlap<SRSub>;
+using TestRandomLastBitOverlapMul = TestRandomLastBitOverlap<SRMul>;
+using TestRandomLastBitOverlapDiv = TestRandomLastBitOverlap<SRDiv>;
+
+HWY_NOINLINE void TestAllRandomLastBitOverlapAdd() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandomLastBitOverlapAdd>());
+}
+
+HWY_NOINLINE void TestAllRandomLastBitOverlapSub() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandomLastBitOverlapSub>());
+}
+
+HWY_NOINLINE void TestAllRandomLastBitOverlapMul() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandomLastBitOverlapMul>());
+}
+
+HWY_NOINLINE void TestAllRandomLastBitOverlapDiv() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandomLastBitOverlapDiv>());
+}
+
+template <class Op, class D, class V = hn::VFromD<D>,
+          typename T = hn::TFromD<D>>
+void do_random_mid_overlap_test(D d) {
+  double start_range_1st = 1.0;
+  double end_range_1st = 2.0;
+  int s2;
+  if constexpr (std::is_same_v<T, float>) {
+    s2 = -13;
+  } else {
+    s2 = -27;
+  };
+  double start_range_2nd = std::ldexp(1.0, s2 + 1);
+  double end_range_2nd = std::ldexp(1.0, s2 + 1);
+  do_run_test_random<Op>(d, start_range_1st, end_range_1st, start_range_2nd,
+                         end_range_2nd);
+}
+
+template <class Op> struct TestRandomMidOverlap {
+  template <typename T, class D> void operator()(T /*unused*/, D d) {
+    do_random_mid_overlap_test<Op>(d);
+  }
+};
+
+using TestRandomMidOverlapAdd = TestRandomMidOverlap<SRAdd>;
+using TestRandomMidOverlapSub = TestRandomMidOverlap<SRSub>;
+using TestRandomMidOverlapMul = TestRandomMidOverlap<SRMul>;
+using TestRandomMidOverlapDiv = TestRandomMidOverlap<SRDiv>;
+
+HWY_NOINLINE void TestAllRandomMidOverlapAdd() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandomMidOverlapAdd>());
+}
+
+HWY_NOINLINE void TestAllRandomMidOverlapSub() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandomMidOverlapSub>());
+}
+
+HWY_NOINLINE void TestAllRandomMidOverlapMul() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandomMidOverlapMul>());
+}
+
+HWY_NOINLINE void TestAllRandomMidOverlapDiv() {
+  hn::ForFloat3264Types(hn::ForPartialVectors<TestRandomMidOverlapDiv>());
+}
+
+} // namespace
+} // namespace HWY_NAMESPACE
+} // namespace sr
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace sr {
+namespace HWY_NAMESPACE {
+
+HWY_BEFORE_TEST(SRRoundTest);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllExactOperationsAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsDiv);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsDiv);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsDiv);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapDiv);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapDiv);
+HWY_AFTER_TEST();
+
+} // namespace HWY_NAMESPACE
+} // namespace sr
+
+HWY_TEST_MAIN();
+
+#endif // HWY_ONCE
+
+// int main(int argc, char **argv) {
+//   testing::InitGoogleTest(&argc, argv);
+//   init();
+//   return RUN_ALL_TESTS();
+// }
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/tests/sr_hw_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_sr_hwy_performance.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/sr_hw_test.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_sr_hwy_performance.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/twoprodfma_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_twoprodfma.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/twoprodfma_test.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_twoprodfma.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/twosum_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_twosum.cpp
similarity index 100%
rename from src/libvfcinstrumentonline/rand/tests/twosum_test.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_twosum.cpp
diff --git a/src/libvfcinstrumentonline/rand/tests/srround.cpp b/src/libvfcinstrumentonline/rand/tests/test_ud_accuracy.cpp
similarity index 66%
rename from src/libvfcinstrumentonline/rand/tests/srround.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_ud_accuracy.cpp
index c6cb7c65..b4590143 100644
--- a/src/libvfcinstrumentonline/rand/tests/srround.cpp
+++ b/src/libvfcinstrumentonline/rand/tests/test_ud_accuracy.cpp
@@ -5,17 +5,24 @@
 #include <random>
 #include <vector>
 
-#include "src/eft.hpp"
-#include "src/main.hpp"
-#include "src/sr.hpp"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "src/ud.h"
 #include "src/utils.hpp"
 #include "tests/helper.hpp"
 
-constexpr auto default_alpha = 0.001;
+using ::testing::AllOf;
+using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::Le;
+using ::testing::Lt;
+
+constexpr auto default_alpha = 0.0001;
 #ifdef SR_DEBUG
 constexpr auto default_repetitions = 10;
 #else
-constexpr auto default_repetitions = 100'000;
+constexpr auto default_repetitions = 1'000;
 #endif
 namespace reference {
 // return pred(|s|)
@@ -24,22 +31,22 @@ namespace reference {
 // compute in double precision if the input type is float
 // compute in quad precision if the input type is double
 template <typename T, typename H = typename helper::IEEE754<T>::H>
-H sr_add(T a, T b) {
+H add(T a, T b) {
   return static_cast<H>(a) + static_cast<H>(b);
 }
 
 template <typename T, typename H = typename helper::IEEE754<T>::H>
-H sr_sub(T a, T b) {
+H sub(T a, T b) {
   return static_cast<H>(a) - static_cast<H>(b);
 }
 
 template <typename T, typename H = typename helper::IEEE754<T>::H>
-H sr_mul(T a, T b) {
+H mul(T a, T b) {
   return static_cast<H>(a) * static_cast<H>(b);
 }
 
 template <typename T, typename H = typename helper::IEEE754<T>::H>
-H sr_div(T a, T b) {
+H div(T a, T b) {
   return static_cast<H>(a) / static_cast<H>(b);
 }
 
@@ -47,16 +54,16 @@ template <typename T, typename Op, typename H = typename helper::IEEE754<T>::H>
 std::function<H(T, T)> get_operator() {
   static_assert(std::is_base_of_v<helper::Operator<T>, Op>);
   if constexpr (std::is_same_v<helper::AddOp<T>, Op>) {
-    return sr_add<T>;
+    return add<T>;
   }
   if constexpr (std::is_same_v<Op, helper::SubOp<T>>) {
-    return sr_sub<T>;
+    return sub<T>;
   }
   if constexpr (std::is_same_v<Op, helper::MulOp<T>>) {
-    return sr_mul<T>;
+    return mul<T>;
   }
   if constexpr (std::is_same_v<Op, helper::DivOp<T>>) {
-    return sr_div<T>;
+    return div<T>;
   }
 }
 
@@ -66,59 +73,70 @@ template <typename T, typename Op> typename Op::function get_target_operator() {
   static_assert(std::is_base_of_v<helper::Operator<T>, Op>);
 
   if constexpr (std::is_same_v<Op, helper::AddOp<T>>) {
-    return sr_add<T>;
+    return ud::scalar::add<T>;
   }
   if constexpr (std::is_same_v<Op, helper::SubOp<T>>) {
-    return sr_sub<T>;
+    return ud::scalar::sub<T>;
   }
   if constexpr (std::is_same_v<Op, helper::MulOp<T>>) {
-    return sr_mul<T>;
+    return ud::scalar::mul<T>;
   }
   if constexpr (std::is_same_v<Op, helper::DivOp<T>>) {
-    return sr_div<T>;
+    return ud::scalar::div<T>;
   }
 }
 
 template <typename T, typename H = typename helper::IEEE754<T>::H>
-std::pair<H, H> compute_distance_error(T a, T b, H reference) {
+std::tuple<H, H, H, H, H, std::string> compute_distance_error(T a, T b,
+                                                              H reference) {
   T ref_cast = static_cast<T>(reference);
 
-  H error = helper::absolute_distance(reference, static_cast<H>(ref_cast));
-  H ulp = helper::get_ulp(ref_cast);
-
-  H probability_down = 1 - (error / ulp);
-  H probability_up = 1 - probability_down;
-
-  if (ref_cast > reference) {
-    std::swap(probability_down, probability_up);
-  }
-
-  if (error == 0 or ulp == 0 or
-      helper::abs(reference) < helper::IEEE754<T>::min_subnormal or
-      helper::abs(error) < helper::IEEE754<T>::min_subnormal) {
-    probability_down = 0;
-    probability_up = 1;
+  if (helper::isnan(a) or helper::isnan(b) or helper::isnan(reference) or
+      helper::isinf(a) or helper::isinf(b) or helper::isinf(reference) or
+      helper::isinf(ref_cast)) {
+    return {0, 0, reference, reference, 0, ""};
   }
 
-  return {probability_down, probability_up};
-}
-
-template <typename T, typename H = typename helper::IEEE754<T>::H>
-std::string compute_distance_error_str(T a, T b, H reference) {
-  T ref_cast = static_cast<T>(reference);
-  H error = helper::absolute_distance(reference, static_cast<H>(ref_cast));
+  H error = 0, error_c = 0;
+  H probability_down = 0, probability_up = 0;
+  H next = 0, prev = 0;
   H ulp = helper::get_ulp(ref_cast);
 
-  auto [probability_down, probability_up] =
-      compute_distance_error(a, b, reference);
-
-  H next, prev;
-  if (ref_cast < reference) {
-    prev = ref_cast;
-    next = static_cast<T>(reference + error);
+  if (ref_cast == reference) {
+    error = 0;
+    probability_down = 1;
+    probability_up = 1;
+    prev = reference;
+    next = reference;
   } else {
-    prev = static_cast<T>(reference - error);
-    next = ref_cast;
+    error = helper::absolute_distance(reference, static_cast<H>(ref_cast));
+    error_c = helper::absolute_distance(static_cast<H>(ulp), error);
+    prev = (ref_cast < reference) ? ref_cast : (ref_cast - ulp);
+    next = (ref_cast < reference) ? (ref_cast + ulp) : ref_cast;
+    probability_down = (next - reference) / ulp;
+    probability_up = (reference - prev) / ulp;
+
+    if (((error + error_c) != ulp)) {
+      std::cerr << "error + error_c != ulp" << "\n"
+                << "error:   " << helper::hexfloat(error) << "\n"
+                << "error_c: " << helper::hexfloat(error_c) << "\n"
+                << "prev:    " << helper::hexfloat(prev) << "\n"
+                << "next:    " << helper::hexfloat(next) << "\n"
+                << "ulp:     " << helper::hexfloat(ulp) << std::endl;
+      HWY_ASSERT(false);
+    }
+    if (probability_down + probability_up != 1) {
+      std::cerr << "probability_down + probability_up != 1" << "\n"
+                << "probability_down: " << probability_down << "\n"
+                << "probability_up:   " << probability_up << "\n"
+                << "reference:        " << helper::hexfloat(reference) << "\n"
+                << "prev:             " << helper::hexfloat(prev) << "\n"
+                << "next:             " << helper::hexfloat(next) << "\n"
+                << "error:            " << helper::hexfloat(error) << "\n"
+                << "error_c:          " << helper::hexfloat(error_c) << "\n"
+                << "ulp:              " << helper::hexfloat(ulp) << std::endl;
+      HWY_ASSERT(false);
+    }
   }
 
   std::ostringstream os;
@@ -133,13 +151,16 @@ std::string compute_distance_error_str(T a, T b, H reference) {
     os << " (double)reference: " << helper::hexfloat(ref_cast) << std::endl;
     os << " |ref-(double)ref|: " << helper::hexfloat(error) << std::endl;
   }
+  os << "           error_c: " << helper::hexfloat(error_c) << std::endl;
   os << "               ulp: " << helper::hexfloat(ulp) << std::endl;
   os << "       reference ↓: " << helper::hexfloat(prev) << std::endl;
   os << "       reference ↑: " << helper::hexfloat(next) << std::endl;
   os << std::defaultfloat;
   os << "                 p: " << probability_down << std::endl;
   os << "               1-p: " << probability_up << std::endl;
-  return os.str();
+  auto msg = os.str();
+
+  return {probability_down, probability_up, prev, next, error, msg};
 }
 
 template <typename T, typename Op>
@@ -173,10 +194,13 @@ void check_distribution_match(T a, T b,
                               const float alpha = default_alpha) {
   using H = typename helper::IEEE754<T>::H;
 
+  const auto op_name = Op::name;
+  const auto ftype = Op::ftype;
+
   auto reference_op = reference::get_operator<T, Op>();
   H reference = reference_op(a, b);
-  auto [probability_down, probability_up] =
-      compute_distance_error(a, b, reference);
+  auto [probability_down, probability_up, down, up, distance_error,
+        distance_error_msg] = compute_distance_error(a, b, reference);
 
   auto counter = eval_op<T, Op>(a, b, repetitions);
   auto count_down = counter.down_count();
@@ -199,58 +223,63 @@ void check_distribution_match(T a, T b,
     return;
   }
 
-  /* code */
-  // EXPECT_THAT(static_cast<double>(probability_down), AllOf(Ge(0.0), Le(1.0)))
-  //     << "Probability ↓ is not in [0, 1] range\n"
-  //     << "-- theoretical -\n"
-  //     << "   probability ↓: " << fmt_proba(probability_down) << "\n"
-  //     << "   probability ↑: " << fmt_proba(probability_up) << "\n"
-  //     << "--- estimated --\n"
-  //     << "     sample size: " << repetitions << "\n"
-  //     << "              #↓: " << count_down << " ("
-  //     << fmt_proba(probability_down_estimated) << ")\n"
-  //     << "              #↑: " << count_up << " ("
-  //     << fmt_proba(probability_up_estimated) << ")\n"
-  //     << std::hexfloat << ""
-  //     << "              ↓: " << counter.down() << "\n"
-  //     << "              ↑: " << counter.up() << "\n"
-  //     << compute_distance_error_str(a, b, reference) << std::defaultfloat
-  //     << flush();
-
-  auto test = helper::binomial_test(repetitions, count_down,
-                                    static_cast<double>(probability_down));
-
-  const auto op_name = Op::name;
-  const auto ftype = Op::ftype;
-
-  if (test.pvalue == 0)
-    return;
-
-  // EXPECT_THAT(alpha / 2, Lt(test.pvalue))
-  //     << "Null hypotheis rejected!\n"
-  //     << "            type: " << ftype << "\n"
-  //     << "              op: " << op_name << "\n"
-  //     << "           alpha: " << alpha << "\n"
-  //     << std::hexfloat << std::setprecision(13) << ""
-  //     << "               a: " << helper::hexfloat(a) << "\n"
-  //     << "               b: " << helper::hexfloat(b) << "\n"
-  //     << "             a+b: " << helper::hexfloat(reference) << "\n"
-  //     << std::defaultfloat << ""
-  //     << "-- theoretical -\n"
-  //     << "   probability ↓: " << fmt_proba(probability_down) << "\n"
-  //     << "   probability ↑: " << fmt_proba(probability_up) << "\n"
-  //     << "--- estimated --\n"
-  //     << "     sample size: " << repetitions << "\n"
-  //     << "              #↓: " << count_down << " ("
-  //     << fmt_proba(probability_down_estimated) << ")\n"
-  //     << "              #↑: " << count_up << " ("
-  //     << fmt_proba(probability_up_estimated) << ")\n"
-  //     << "         p-value: " << test.pvalue << "\n"
-  //     << std::hexfloat << ""
-  //     << "              ↓: " << helper::hexfloat(counter.down()) << "\n"
-  //     << "              ↑: " << helper::hexfloat(counter.up()) << "\n"
-  //     << compute_distance_error_str(a, b, reference) << std::defaultfloat
-  //     << flush();
+  EXPECT_THAT(static_cast<double>(probability_down), AllOf(Ge(0.0), Le(1.0)))
+      << "Probability ↓ is not in [0, 1] range\n"
+      << "-- theoretical -\n"
+      << "   probability ↓: " << fmt_proba(probability_down) << "\n"
+      << "   probability ↑: " << fmt_proba(probability_up) << "\n"
+      << "--- estimated --\n"
+      << "     sample size: " << repetitions << "\n"
+      << "              #↓: " << count_down << " ("
+      << fmt_proba(probability_down_estimated) << ")\n"
+      << "              #↑: " << count_up << " ("
+      << fmt_proba(probability_up_estimated) << ")\n"
+      << std::hexfloat << "" << "              ↓: " << counter.down() << "\n"
+      << "              ↑: " << counter.up() << "\n"
+      << distance_error_msg << std::defaultfloat << flush();
+
+  // do the test only if the operation is not exact (probability is not zero)
+  bool is_exact = probability_down == 1 and probability_up == 1;
+
+  // do the test only if the operation is not exact (probability is not zero)
+  // do the test if the distance between the reference and the estimated value
+  // is greater than the minimum subnormal
+  // do not the test if the probability is lower than 1/repetitions
+  bool compare_down_values = not is_exact and down != 0 and
+                             probability_down > (1.0 / repetitions) and
+                             distance_error > helper::IEEE754<T>::min_subnormal;
+  bool compare_up_values = not is_exact and up != 0 and
+                           probability_up > (1.0 / repetitions) and
+                           distance_error > helper::IEEE754<T>::min_subnormal;
+
+  // binomial test
+  auto test = helper::binomial_test(repetitions, count_down, 0.5);
+
+  // check probability if we compare the values
+  if (compare_down_values and compare_up_values)
+    EXPECT_THAT(alpha / 2, Lt(test.pvalue))
+        << "Null hypotheis rejected!\n"
+        << "            type: " << ftype << "\n"
+        << "              op: " << op_name << "\n"
+        << "           alpha: " << alpha << "\n"
+        << std::hexfloat << std::setprecision(13) << ""
+        << "               a: " << helper::hexfloat(a) << "\n"
+        << "               b: " << helper::hexfloat(b) << "\n"
+        << "             a+b: " << helper::hexfloat(reference) << "\n"
+        << std::defaultfloat << "" << "-- theoretical -\n"
+        << "   probability ↓: " << fmt_proba(probability_down) << "\n"
+        << "   probability ↑: " << fmt_proba(probability_up) << "\n"
+        << "--- estimated --\n"
+        << "     sample size: " << repetitions << "\n"
+        << "              #↓: " << count_down << " ("
+        << fmt_proba(probability_down_estimated) << ")\n"
+        << "              #↑: " << count_up << " ("
+        << fmt_proba(probability_up_estimated) << ")\n"
+        << "         p-value: " << test.pvalue << "\n"
+        << std::hexfloat << ""
+        << "              ↓: " << helper::hexfloat(counter.down()) << "\n"
+        << "              ↑: " << helper::hexfloat(counter.up()) << "\n"
+        << distance_error_msg << std::defaultfloat << flush();
   debug_reset();
 }
 
@@ -491,6 +520,5 @@ TEST(SRRoundTest, RandomMidOverlapDivAssertions) {
 
 int main(int argc, char **argv) {
   testing::InitGoogleTest(&argc, argv);
-  init();
   return RUN_ALL_TESTS();
 }
\ No newline at end of file
diff --git a/src/libvfcinstrumentonline/rand/tests/srround_test_hw-inl.cpp b/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_accuracy.cpp
similarity index 82%
rename from src/libvfcinstrumentonline/rand/tests/srround_test_hw-inl.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_ud_hwy_accuracy.cpp
index 9d7f9b37..684e9f4d 100644
--- a/src/libvfcinstrumentonline/rand/tests/srround_test_hw-inl.cpp
+++ b/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_accuracy.cpp
@@ -12,17 +12,17 @@
 
 // clang-format off
 #undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/srround_test_hw-inl.cpp"
+#define HWY_TARGET_INCLUDE "tests/test_ud_hwy_accuracy.cpp"
 #include "hwy/foreach_target.h"
 
 #include "hwy/highway.h"
 #include "hwy/base.h"
 #include "hwy/tests/test_util-inl.h"
 #include "src/debug_hwy-inl.h"
-#include "src/sr_hw-inl.h"
+#include "src/ud_hw-inl.h"
 // clang-format on
 
-#include "src/sr_hw.h"
+#include "src/ud_hw.h"
 #include "src/utils.hpp"
 #include "tests/helper.hpp"
 
@@ -41,11 +41,11 @@ using ::testing::Ge;
 using ::testing::Le;
 using ::testing::Lt;
 
-constexpr auto default_alpha = 0.001;
+constexpr auto default_alpha = 0.0001;
 #ifdef SR_DEBUG
 constexpr auto default_repetitions = 100;
 #else
-constexpr auto default_repetitions = 10'000;
+constexpr auto default_repetitions = 1'000;
 #endif
 
 static auto distribution_failed_tests_counter = 0;
@@ -79,15 +79,15 @@ H div(T a, T b) {
 
 }; // namespace reference
 
-namespace srvh = sr::vector::HWY_NAMESPACE;
+namespace udvh = ud::vector::HWY_NAMESPACE;
 
 struct SRAdd {
-  static constexpr std::string name = "add";
-  static constexpr std::string symbol = "+";
+  static constexpr char name[] = "add";
+  static constexpr char symbol[] = "+";
 
   template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
   V operator()(D d, V a, V b) {
-    return srvh::sr_add<D>(a, b);
+    return udvh::add<D>(a, b);
   }
 
   template <typename T, typename H = typename helper::IEEE754<T>::H>
@@ -97,12 +97,12 @@ struct SRAdd {
 };
 
 struct SRSub {
-  static constexpr std::string name = "sub";
-  static constexpr std::string symbol = "-";
+  static constexpr char name[] = "sub";
+  static constexpr char symbol[] = "-";
 
   template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
   V operator()(D d, V a, V b) {
-    return srvh::sr_sub<D>(a, b);
+    return udvh::sub<D>(a, b);
   }
 
   template <typename T, typename H = typename helper::IEEE754<T>::H>
@@ -112,12 +112,12 @@ struct SRSub {
 };
 
 struct SRMul {
-  static constexpr std::string name = "mul";
-  static constexpr std::string symbol = "*";
+  static constexpr char name[] = "mul";
+  static constexpr char symbol[] = "*";
 
   template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
   V operator()(D d, V a, V b) {
-    return srvh::sr_mul<D>(a, b);
+    return udvh::mul<D>(a, b);
   }
 
   template <typename T, typename H = typename helper::IEEE754<T>::H>
@@ -127,12 +127,12 @@ struct SRMul {
 };
 
 struct SRDiv {
-  static constexpr std::string name = "div";
-  static constexpr std::string symbol = "/";
+  static constexpr char name[] = "div";
+  static constexpr char symbol[] = "/";
 
   template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
   V operator()(D d, V a, V b) {
-    return srvh::sr_div<D>(a, b);
+    return udvh::div<D>(a, b);
   }
 
   template <typename T, typename H = typename helper::IEEE754<T>::H>
@@ -142,13 +142,14 @@ struct SRDiv {
 };
 
 template <typename T, typename H = typename helper::IEEE754<T>::H>
-std::tuple<H, H, std::string> compute_distance_error(T a, T b, H reference) {
+std::tuple<H, H, H, H, H, std::string> compute_distance_error(T a, T b,
+                                                              H reference) {
   T ref_cast = static_cast<T>(reference);
 
   if (helper::isnan(a) or helper::isnan(b) or helper::isnan(reference) or
       helper::isinf(a) or helper::isinf(b) or helper::isinf(reference) or
       helper::isinf(ref_cast)) {
-    return {0, 0, ""};
+    return {0, 0, reference, reference, 0, ""};
   }
 
   H error = 0, error_c = 0;
@@ -214,7 +215,7 @@ std::tuple<H, H, std::string> compute_distance_error(T a, T b, H reference) {
   os << "               1-p: " << probability_up << std::endl;
   auto msg = os.str();
 
-  return {probability_down, probability_up, msg};
+  return {probability_down, probability_up, prev, next, error, msg};
 }
 
 template <class Op, class D, class V = hn::VFromD<D>,
@@ -281,8 +282,8 @@ void check_distribution_match(D d, V va, V vb,
   auto counters = eval_op<Op>(d, va, vb, repetitions);
 
   H reference = Op::reference(a, b);
-  auto [probability_down, probability_up, distance_error_msg] =
-      compute_distance_error(a, b, reference);
+  auto [probability_down, probability_up, down, up, distance_error,
+        distance_error_msg] = compute_distance_error(a, b, reference);
 
   size_t lane = 0;
   for (auto &counter : counters) {
@@ -340,60 +341,65 @@ void check_distribution_match(D d, V va, V vb,
       HWY_ASSERT(false);
     }
 
+    // do the test only if the operation is not exact (probability is not zero)
+    bool is_exact = probability_down == 1 and probability_up == 1;
+
+    // do the test only if the operation is not exact (probability is not zero)
+    // do the test if the distance between the reference and the estimated value
+    // is greater than the minimum subnormal
+    // do not the test if the probability is lower than 1/repetitions
+    bool compare_down_values =
+        not is_exact and down != 0 and
+        probability_down > (1.0 / repetitions) and
+        distance_error > helper::IEEE754<T>::min_subnormal;
+    bool compare_up_values = not is_exact and up != 0 and
+                             probability_up > (1.0 / repetitions) and
+                             distance_error > helper::IEEE754<T>::min_subnormal;
+
     // 95% relative error
     auto error_down = (probability_down_estimated / probability_down);
     auto error_up = (probability_up_estimated / probability_up);
 
     // binomial test
-    auto test = helper::binomial_test(repetitions, count_down,
-                                      static_cast<double>(probability_down));
+    auto test = helper::binomial_test(repetitions, count_down, .5);
 
     const auto op_name = Op::name;
     const auto op_symbol = Op::symbol;
     const auto ftype = typeid(T).name();
 
-    if (test.pvalue == 0)
-      return;
-
     // Bonferroni correction, divide by the number of lanes
     const auto lanes = hn::Lanes(d);
     const auto alpha_bon = (alpha / 2) / lanes;
 
-    if (test.pvalue < alpha_bon) {
-      // check that we are in the 95% confidence interval
-      // to avoid false positives
-      if (error_down < 0.90 or error_up < 0.90) {
-        std::cerr << "Null hypotheis rejected!\n"
-                  << "     Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n"
-                  << "            type: " << ftype << "\n"
-                  << "              op: " << op_name << "\n"
-                  << "           alpha: " << alpha_bon << "\n"
-                  << std::hexfloat << std::setprecision(13) << ""
-                  << "               a: " << helper::hexfloat(a) << "\n"
-                  << "               b: " << helper::hexfloat(b) << "\n"
-                  << "             a" << op_symbol
-                  << "b: " << helper::hexfloat(reference) << "\n"
-                  << std::defaultfloat << "" << "-- theoretical -\n"
-                  << "   probability ↓: " << fmt_proba(probability_down) << "\n"
-                  << "   probability ↑: " << fmt_proba(probability_up) << "\n"
-                  << "--- estimated --\n"
-                  << "     sample size: " << repetitions << "\n"
-                  << "              #↓: " << count_down << " ("
-                  << fmt_proba(probability_down_estimated) << ")\n"
-                  << "              #↑: " << count_up << " ("
-                  << fmt_proba(probability_up_estimated) << ")\n"
-                  << "         p-value: " << test.pvalue << "\n"
-                  << "            ↓ %: " << error_down * 100 << "\n"
-                  << "            ↑ %: " << error_up * 100 << "\n"
-                  << std::hexfloat
-                  << "              ↓: " << helper::hexfloat(counter.down())
-                  << "\n"
-                  << "              ↑: " << helper::hexfloat(counter.up())
-                  << "\n"
-                  << distance_error_msg << std::defaultfloat << flush();
-        HWY_ASSERT(0);
-      }
-      distribution_failed_tests_counter++;
+    if (compare_down_values and compare_up_values and test.pvalue < alpha_bon) {
+      std::cerr << "Null hypotheis rejected!\n"
+                << "     Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n"
+                << "            type: " << ftype << "\n"
+                << "              op: " << op_name << "\n"
+                << "           alpha: " << alpha_bon << "\n"
+                << std::hexfloat << std::setprecision(13) << ""
+                << "               a: " << helper::hexfloat(a) << "\n"
+                << "               b: " << helper::hexfloat(b) << "\n"
+                << "             a" << op_symbol
+                << "b: " << helper::hexfloat(reference) << "\n"
+                << std::defaultfloat << "" << "-- theoretical -\n"
+                << "   probability ↓: " << fmt_proba(probability_down) << "\n"
+                << "   probability ↑: " << fmt_proba(probability_up) << "\n"
+                << "--- estimated --\n"
+                << "     sample size: " << repetitions << "\n"
+                << "              #↓: " << count_down << " ("
+                << fmt_proba(probability_down_estimated) << ")\n"
+                << "              #↑: " << count_up << " ("
+                << fmt_proba(probability_up_estimated) << ")\n"
+                << "         p-value: " << test.pvalue << "\n"
+                << "            ↓ %: " << error_down * 100 << "\n"
+                << "            ↑ %: " << error_up * 100 << "\n"
+                << std::hexfloat
+                << "              ↓: " << helper::hexfloat(counter.down())
+                << "\n"
+                << "              ↑: " << helper::hexfloat(counter.up()) << "\n"
+                << distance_error_msg << std::defaultfloat << flush();
+      HWY_ASSERT(0);
     }
     distribution_tests_counter++;
     lane++;
@@ -705,9 +711,21 @@ HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsSub);
 HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsMul);
 HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsDiv);
 HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsDiv);
 HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsDiv);
 HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapDiv);
 HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapAdd);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapSub);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapMul);
+HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapDiv);
 HWY_AFTER_TEST();
 
 } // namespace HWY_NAMESPACE
diff --git a/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_performance.cpp b/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_performance.cpp
new file mode 100644
index 00000000..f2c15b80
--- /dev/null
+++ b/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_performance.cpp
@@ -0,0 +1,1175 @@
+#include "hwy/nanobenchmark.h"
+
+#include <chrono>
+#include <numeric>
+#include <stddef.h>
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+#include "hwy/tests/hwy_gtest.h"
+#include "hwy/tests/test_util-inl.h"
+
+#include "src/ud_hw.h"
+
+namespace ud {
+namespace vector {
+
+using VecArgf32 = hwy::AlignedUniquePtr<float[]>;
+using VecArgf64 = hwy::AlignedUniquePtr<double[]>;
+
+using binary_f32_op = void (*)(const VecArgf32 &, const VecArgf32 &,
+                               const VecArgf32 &, const size_t);
+using binary_f64_op = void (*)(const VecArgf64 &, const VecArgf64 &,
+                               const VecArgf64 &, const size_t);
+
+template <typename T> constexpr const char *GetFormatString() {
+  if constexpr (std::is_same_v<T, float>) {
+    return "%+.6a ";
+  } else {
+    return "%+.13a ";
+  }
+}
+
+template <std::size_t S, typename T, typename Op, std::size_t N = 10>
+void MeasureFunction(Op func, const std::size_t lanes = 0,
+                     const bool verbose = false) {
+
+  constexpr size_t inputs_size = S;
+  const auto a = hwy::MakeUniqueAlignedArray<T>(inputs_size);
+  const auto b = hwy::MakeUniqueAlignedArray<T>(inputs_size);
+  auto c = hwy::MakeUniqueAlignedArray<T>(inputs_size);
+  constexpr T ulp = std::is_same_v<T, float> ? 0x1.0p-24f : 0x1.0p-53;
+  constexpr const char *fmt = GetFormatString<T>();
+
+  for (size_t i = 0; i < inputs_size; i++) {
+    a[i] = 1.0;
+    b[i] = ulp;
+  }
+
+  std::vector<double> times(N);
+
+  for (size_t i = 0; i < N; i++) {
+    auto start = std::chrono::high_resolution_clock::now();
+    func(a, b, c, inputs_size);
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+    times[i] = diff.count();
+    if (verbose)
+      std::cout << "Iteration: " << i << " time: " << diff.count() << " s\n";
+
+    if (verbose) {
+      for (size_t i = 0; i < inputs_size; i++) {
+        fprintf(stderr, fmt, c[i]);
+        if ((lanes != 0) and ((i % lanes) == (lanes - 1))) {
+          fprintf(stderr, "\n");
+        }
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+
+  // min, median, mean, max
+  auto min = *std::min_element(times.begin(), times.end());
+  auto max = *std::max_element(times.begin(), times.end());
+  auto mean = std::accumulate(times.begin(), times.end(), 0.0) / N;
+  auto std = std::sqrt(
+      std::inner_product(times.begin(), times.end(), times.begin(), 0.0) / N -
+      mean * mean);
+
+  fprintf(stderr, "[%-4zu] ", inputs_size);
+  fprintf(stderr, "%.4e ± %.4e [%.4e - %.4e] (%zu)\n", mean, std, min, max, N);
+}
+
+template <std::size_t S, typename T, typename V, typename Op,
+          std::size_t N = 10>
+void MeasureFunctionX(Op func, const std::size_t lanes = 0,
+                      const bool verbose = false) {
+
+  constexpr size_t inputs_size = S;
+  constexpr T ulp = std::is_same_v<T, float> ? 0x1.0p-24 : 0x1.0p-53;
+  V a = {0};
+  V b = {0};
+  constexpr const char *fmt = GetFormatString<T>();
+
+  for (size_t i = 0; i < inputs_size; i++) {
+    a[i] = 1.0;
+    b[i] = ulp;
+  }
+
+  std::vector<double> times(N);
+
+  for (size_t i = 0; i < N; i++) {
+    auto start = std::chrono::high_resolution_clock::now();
+    auto c = func(a, b, inputs_size);
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+    times[i] = diff.count();
+    if (verbose)
+      std::cout << "Iteration: " << i << " time: " << diff.count() << " s\n";
+
+    if (verbose) {
+      for (size_t j = 0; j < inputs_size; j++) {
+        fprintf(stderr, fmt, ((T *)&c)[j]);
+        if ((lanes != 0) and ((j % lanes) == (lanes - 1))) {
+          fprintf(stderr, "\n");
+        }
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+
+  // min, median, mean, max
+  auto min = *std::min_element(times.begin(), times.end());
+  auto max = *std::max_element(times.begin(), times.end());
+  auto mean = std::accumulate(times.begin(), times.end(), 0.0) / N;
+  auto std = std::sqrt(
+      std::inner_product(times.begin(), times.end(), times.begin(), 0.0) / N -
+      mean * mean);
+
+  fprintf(stderr, "[%-4zu] ", inputs_size);
+  fprintf(stderr, "%.4e ± %.4e [%.4e - %.4e] (%zu)\n", mean, std, min, max, N);
+}
+
+/* Test array inputs */
+
+#define define_array_test(op, type)                                            \
+  void test_##op##type(const VecArg##type &a, const VecArg##type &b,           \
+                       const VecArg##type &c, const size_t count) {            \
+    op##type(a.get(), b.get(), c.get(), count);                                \
+  }
+
+define_array_test(add, f32);
+define_array_test(sub, f32);
+define_array_test(mul, f32);
+define_array_test(div, f32);
+
+define_array_test(add, f64);
+define_array_test(sub, f64);
+define_array_test(mul, f64);
+define_array_test(div, f64);
+
+/* Test vector pointer inputs */
+
+#define define_array_test_vector(op, type, size)                               \
+  void test_##op##type##x##size(const VecArg##type &a, const VecArg##type &b,  \
+                                const VecArg##type &c, const size_t count) {   \
+    op##type##x##size(a.get(), b.get(), c.get());                              \
+  }
+
+/* 64-bits */
+#if HWY_MAX_BYTES >= 8
+define_array_test_vector(add, f32, 2);
+define_array_test_vector(sub, f32, 2);
+define_array_test_vector(mul, f32, 2);
+define_array_test_vector(div, f32, 2);
+#endif
+
+/* 128-bits */
+#if HWY_MAX_BYTES >= 16
+define_array_test_vector(add, f32, 4);
+define_array_test_vector(sub, f32, 4);
+define_array_test_vector(mul, f32, 4);
+define_array_test_vector(div, f32, 4);
+
+define_array_test_vector(add, f64, 2);
+define_array_test_vector(sub, f64, 2);
+define_array_test_vector(mul, f64, 2);
+define_array_test_vector(div, f64, 2);
+#endif
+
+/* 256-bits */
+#if HWY_MAX_BYTES >= 32
+define_array_test_vector(add, f32, 8);
+define_array_test_vector(sub, f32, 8);
+define_array_test_vector(mul, f32, 8);
+define_array_test_vector(div, f32, 8);
+
+define_array_test_vector(add, f64, 4);
+define_array_test_vector(sub, f64, 4);
+define_array_test_vector(mul, f64, 4);
+define_array_test_vector(div, f64, 4);
+#endif
+
+/* 512-bits */
+#if HWY_MAX_BYTES >= 64
+define_array_test_vector(add, f64, 8);
+define_array_test_vector(sub, f64, 8);
+define_array_test_vector(mul, f64, 8);
+define_array_test_vector(div, f64, 8);
+
+define_array_test_vector(add, f32, 16);
+define_array_test_vector(sub, f32, 16);
+define_array_test_vector(mul, f32, 16);
+define_array_test_vector(div, f32, 16);
+#endif
+
+/* Test vector inputs (static dispatch) */
+
+#define define_vector_test(op, type, size)                                     \
+  type##x##size##_v test_##op##type##x##size##_v(const type##x##size##_v a,    \
+                                                 const type##x##size##_v b,    \
+                                                 const size_t count) {         \
+    return op##type##x##size##_v(a, b);                                        \
+  }
+
+/* 64-bits */
+
+#if HWY_MAX_BYTES >= 8
+define_vector_test(add, f32, 2);
+define_vector_test(sub, f32, 2);
+define_vector_test(mul, f32, 2);
+define_vector_test(div, f32, 2);
+#endif
+
+/* 128-bits */
+
+#if HWY_MAX_BYTES >= 16
+define_vector_test(add, f64, 2);
+define_vector_test(sub, f64, 2);
+define_vector_test(mul, f64, 2);
+define_vector_test(div, f64, 2);
+
+define_vector_test(add, f32, 4);
+define_vector_test(sub, f32, 4);
+define_vector_test(mul, f32, 4);
+define_vector_test(div, f32, 4);
+#endif
+
+/* 256-bits */
+#if HWY_MAX_BYTES >= 32
+define_vector_test(add, f64, 4);
+define_vector_test(sub, f64, 4);
+define_vector_test(mul, f64, 4);
+define_vector_test(div, f64, 4);
+
+define_vector_test(add, f32, 8);
+define_vector_test(sub, f32, 8);
+define_vector_test(mul, f32, 8);
+define_vector_test(div, f32, 8);
+#endif
+
+/* 512-bits */
+#if HWY_MAX_BYTES >= 64
+define_vector_test(add, f64, 8);
+define_vector_test(sub, f64, 8);
+define_vector_test(mul, f64, 8);
+define_vector_test(div, f64, 8);
+
+define_vector_test(add, f32, 16);
+define_vector_test(sub, f32, 16);
+define_vector_test(mul, f32, 16);
+define_vector_test(div, f32, 16);
+#endif
+
+/* Test vector inputs (dynamic dispatch) */
+
+#define define_vector_test_dynamic(op, type, size)                             \
+  type##x##size##_v test_##op##type##x##size##_d(const type##x##size##_v a,    \
+                                                 const type##x##size##_v b,    \
+                                                 const size_t count) {         \
+    return op##type##x##size##_d(a, b);                                        \
+  }
+
+define_vector_test_dynamic(add, f32, 2);
+define_vector_test_dynamic(sub, f32, 2);
+define_vector_test_dynamic(mul, f32, 2);
+define_vector_test_dynamic(div, f32, 2);
+
+define_vector_test_dynamic(add, f64, 2);
+define_vector_test_dynamic(sub, f64, 2);
+define_vector_test_dynamic(mul, f64, 2);
+define_vector_test_dynamic(div, f64, 2);
+
+define_vector_test_dynamic(add, f32, 4);
+define_vector_test_dynamic(sub, f32, 4);
+define_vector_test_dynamic(mul, f32, 4);
+define_vector_test_dynamic(div, f32, 4);
+
+define_vector_test_dynamic(add, f64, 4);
+define_vector_test_dynamic(sub, f64, 4);
+define_vector_test_dynamic(mul, f64, 4);
+define_vector_test_dynamic(div, f64, 4);
+
+define_vector_test_dynamic(add, f32, 8);
+define_vector_test_dynamic(sub, f32, 8);
+define_vector_test_dynamic(mul, f32, 8);
+define_vector_test_dynamic(div, f32, 8);
+
+define_vector_test_dynamic(add, f64, 8);
+define_vector_test_dynamic(sub, f64, 8);
+define_vector_test_dynamic(mul, f64, 8);
+define_vector_test_dynamic(div, f64, 8);
+
+define_vector_test_dynamic(add, f32, 16);
+define_vector_test_dynamic(sub, f32, 16);
+define_vector_test_dynamic(mul, f32, 16);
+define_vector_test_dynamic(div, f32, 16);
+
+// Recursion function to call MeasureFunction with powers of 2
+template <size_t N, size_t Max, typename T, typename Op>
+void callMeasureFunctions(Op function) {
+  MeasureFunction<N, T>(function);
+  if constexpr (N * 2 <= Max) {
+    callMeasureFunctions<N * 2, Max, T, Op>(function);
+  }
+}
+
+/* Test on Array inputs */
+
+/* IEEE-754 binary32 */
+
+TEST(SRArrayBenchmark, SRAddF32) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32 with " << N << " repetitions\n";
+  callMeasureFunctions<2, 1024, float>(&test_addf32);
+}
+
+TEST(SRArrayBenchmark, SRSubF32) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32 with " << N << " repetitions\n";
+  callMeasureFunctions<2, 1024, float>(&test_subf32);
+}
+
+TEST(SRArrayBenchmark, SRMulF32) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32 with " << N << " repetitions\n";
+  callMeasureFunctions<2, 1024, float>(&test_mulf32);
+}
+
+TEST(SRArrayBenchmark, SRDivF32) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32 with " << N << " repetitions\n";
+  callMeasureFunctions<2, 1024, float>(&test_divf32);
+}
+
+/* IEEE-754 binary64 */
+
+TEST(SRArrayBenchmark, SRAddF64) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64 with " << N << " repetitions\n";
+  callMeasureFunctions<2, 1024, double>(&test_addf64);
+}
+
+TEST(SRArrayBenchmark, SRSubF64) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64 with " << N << " repetitions\n";
+  callMeasureFunctions<2, 1024, double>(&test_subf64);
+}
+
+TEST(SRArrayBenchmark, SRMulF64) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64 with " << N << " repetitions\n";
+  callMeasureFunctions<2, 1024, double>(&test_mulf64);
+}
+
+TEST(SRArrayBenchmark, SRDivF64) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64 with " << N << " repetitions\n";
+  callMeasureFunctions<2, 1024, double>(&test_divf64);
+}
+
+/* Test on single vector passed by pointer */
+
+constexpr bool kVerbose = false;
+
+/* IEEE-754 binary32 x2 */
+
+TEST(SRVectorPtrBenchmark, SRAddF32x2) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x2 with " << N << " repetitions\n";
+  MeasureFunction<2, float>(&test_addf32x2, 2, kVerbose);
+}
+
+TEST(SRVectorPtrBenchmark, SRSubF32x2) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x2 with " << N << " repetitions\n";
+  MeasureFunction<2, float>(&test_subf32x2, 2, kVerbose);
+}
+
+TEST(SRVectorPtrBenchmark, SRMulF32x2) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x2 with " << N << " repetitions\n";
+  MeasureFunction<2, float>(&test_mulf32x2, 2, kVerbose);
+}
+
+TEST(SRVectorPtrBenchmark, SRDivF32x2) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x2 with " << N << " repetitions\n";
+  MeasureFunction<2, float>(&test_divf32x2, 2, kVerbose);
+}
+
+/* IEEE-754 binary64 x2 */
+
+TEST(SRVectorPtrBenchmark, SRAddF64x2) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x2 with " << N << " repetitions\n";
+  MeasureFunction<2, double>(&test_addf64x2, 2, kVerbose);
+}
+
+TEST(SRVectorPtrBenchmark, SRSubF64x2) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x2 with " << N << " repetitions\n";
+  MeasureFunction<2, double>(&test_subf64x2, 2, kVerbose);
+}
+
+TEST(SRVectorPtrBenchmark, SRMulF64x2) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x2 with " << N << " repetitions\n";
+  MeasureFunction<2, double>(&test_mulf64x2, 2, kVerbose);
+}
+
+TEST(SRVectorPtrBenchmark, SRDivF64x2) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x2 with " << N << " repetitions\n";
+  MeasureFunction<2, double>(&test_divf64x2, 2, kVerbose);
+}
+
+/* IEEE-754 binary32 x4 */
+
+TEST(SRVectorPtrBenchmark, SRAddF32x4) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x4 with " << N << " repetitions\n";
+  MeasureFunction<4, float>(&test_addf32x4, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF32x4 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRSubF32x4) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x4 with " << N << " repetitions\n";
+  MeasureFunction<4, float>(&test_subf32x4, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF32x4 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRMulF32x4) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x4 with " << N << " repetitions\n";
+  MeasureFunction<4, float>(&test_mulf32x4, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF32x4 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRDivF32x4) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x4 with " << N << " repetitions\n";
+  MeasureFunction<4, float>(&test_divf32x4, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF32x4 is not available";
+#endif
+}
+
+/* IEEE-754 binary32 x8 */
+
+TEST(SRVectorPtrBenchmark, SRAddF32x8) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x8 with " << N << " repetitions\n";
+  MeasureFunction<8, float>(&test_addf32x8, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF32x8 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRSubF32x8) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x8 with " << N << " repetitions\n";
+  MeasureFunction<8, float>(&test_subf32x8, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF32x8 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRMulF32x8) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x8 with " << N << " repetitions\n";
+  MeasureFunction<8, float>(&test_mulf32x8, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF32x8 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRDivF32x8) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x8 with " << N << " repetitions\n";
+  MeasureFunction<8, float>(&test_divf32x8, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF32x8 is not available";
+#endif
+}
+
+/* IEEE-754 binary64 x4 */
+
+TEST(SRVectorPtrBenchmark, SRAddF64x4) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x4 with " << N << "repetitions\n";
+  MeasureFunction<4, double>(&test_addf64x4, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF64x4 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRSubF64x4) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x4 with " << N << "repetitions\n";
+  MeasureFunction<4, double>(&test_subf64x4, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF64x4 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRMulF64x4) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x4 with " << N << "repetitions\n";
+  MeasureFunction<4, double>(&test_mulf64x4, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF64x4 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRDivF64x4) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x4 with " << N << "repetitions\n";
+  MeasureFunction<4, double>(&test_divf64x4, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF64x4 is not available";
+#endif
+}
+
+/* IEEE-754 binary64 x8 */
+
+TEST(SRVectorPtrBenchmark, SRAddF64x8) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x8 with " << N << "repetitions\n";
+  MeasureFunction<8, double>(&test_addf64x8, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF64x8 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRSubF64x8) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x8 with " << N << "repetitions\n";
+  MeasureFunction<8, double>(&test_subf64x8, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF64x8 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRMulF64x8) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x8 with " << N << "repetitions\n";
+  MeasureFunction<8, double>(&test_mulf64x8, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF64x8 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRDivF64x8) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x8 with " << N << "repetitions\n";
+  MeasureFunction<8, double>(&test_divf64x8, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF64x8 is not available";
+#endif
+}
+
+/* IEEE-754 binary32 x16 */
+
+TEST(SRVectorPtrBenchmark, SRAddF32x16) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x16 with " << N << "repetitions\n";
+  MeasureFunction<16, float>(&test_addf32x16, 16, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF32x16 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRSubF32x16) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x16 with " << N << "repetitions\n";
+  MeasureFunction<16, float>(&test_subf32x16, 16, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF32x16 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRMulF32x16) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x16 with " << N << "repetitions\n";
+  MeasureFunction<16, float>(&test_mulf32x16, 16, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF32x16 is not available";
+#endif
+}
+
+TEST(SRVectorPtrBenchmark, SRDivF32x16) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x16 with " << N << "repetitions\n";
+  MeasureFunction<16, float>(&test_divf32x16, 16, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF32x16 is not available";
+#endif
+}
+
+/* Test on single vector passed by value with static dispatch */
+
+/* IEEE-754 binary32 x2 */
+
+TEST(SRVectorBenchmark, SRAddF32x2V) {
+#if HWY_MAX_BYTES >= 8
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x2_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, float, f32x2_v>(&test_addf32x2_v, 2, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF32x2V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRSubF32x2V) {
+#if HWY_MAX_BYTES >= 8
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x2_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, float, f32x2_v>(&test_subf32x2_v, 2, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF32x2V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRMulF32x2V) {
+#if HWY_MAX_BYTES >= 8
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x2_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, float, f32x2_v>(&test_mulf32x2_v, 2, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF32x2V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRDivF32x2V) {
+#if HWY_MAX_BYTES >= 8
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x2_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, float, f32x2_v>(&test_divf32x2_v, 2, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF32x2V is not available";
+#endif
+}
+
+/* IEEE-754 binary64 x2 */
+TEST(SRVectorBenchmark, SRAddF64x2V) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x2_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, double, f64x2_v>(&test_addf64x2_v, 2, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF64x2V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRSubF64x2V) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x2_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, double, f64x2_v>(&test_subf64x2_v, 2, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF64x2V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRMulF64x2V) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x2_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, double, f64x2_v>(&test_mulf64x2_v, 2, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF64x2V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRDivF64x2V) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x2_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, double, f64x2_v>(&test_divf64x2_v, 2, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF64x2V is not available";
+#endif
+}
+
+/* IEEE-754 binary32 x4 */
+
+TEST(SRVectorBenchmark, SRAddF32x4V) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x4_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, float, f32x4_v>(&test_addf32x4_v, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF32x4V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRSubF32x4V) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x4_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, float, f32x4_v>(&test_subf32x4_v, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF32x4V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRMulF32x4V) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x4_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, float, f32x4_v>(&test_mulf32x4_v, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF32x4V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRDivF32x4V) {
+#if HWY_MAX_BYTES >= 16
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x4_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, float, f32x4_v>(&test_divf32x4_v, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF32x4V is not available";
+#endif
+}
+
+/* IEEE-754 binary32 x8 */
+
+TEST(SRVectorBenchmark, SRAddF32x8V) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x8_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, float, f32x8_v>(&test_addf32x8_v, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF32x8V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRSubF32x8V) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x8_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, float, f32x8_v>(&test_subf32x8_v, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF32x8V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRMulF32x8V) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x8_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, float, f32x8_v>(&test_mulf32x8_v, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF32x8V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRDivF32x8V) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x8_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, float, f32x8_v>(&test_divf32x8_v, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF32x8V is not available";
+#endif
+}
+
+/* IEEE-754 binary64 x4 */
+
+TEST(SRVectorBenchmark, SRAddF64x4V) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x4_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, double, f64x4_v>(&test_addf64x4_v, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF64x4V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRSubF64x4V) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x4_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, double, f64x4_v>(&test_subf64x4_v, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF64x4V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRMulF64x4V) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x4_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, double, f64x4_v>(&test_mulf64x4_v, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF64x4V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRDivF64x4V) {
+#if HWY_MAX_BYTES >= 32
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x4_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, double, f64x4_v>(&test_divf64x4_v, 4, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF64x4V is not available";
+#endif
+}
+
+/* IEEE-754 binary64 x8 */
+
+TEST(SRVectorBenchmark, SRAddF64x8V) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x8_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, double, f64x8_v>(&test_addf64x8_v, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF64x8V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRSubF64x8V) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x8_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, double, f64x8_v>(&test_subf64x8_v, 8, kVerbose);
+
+#else
+  GTEST_SKIP() << "SRSubF64x8V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRMulF64x8V) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x8_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, double, f64x8_v>(&test_mulf64x8_v, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF64x8V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRDivF64x8V) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x8_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, double, f64x8_v>(&test_divf64x8_v, 8, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF64x8V is not available";
+#endif
+}
+
+/* IEEE-754 binary32 x16 */
+
+TEST(SRVectorBenchmark, SRAddF32x16V) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x16_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<16, float, f32x16_v>(&test_addf32x16_v, 16, kVerbose);
+#else
+  GTEST_SKIP() << "SRAddF32x16V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRSubF32x16V) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x16_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<16, float, f32x16_v>(&test_subf32x16_v, 16, kVerbose);
+#else
+  GTEST_SKIP() << "SRSubF32x16V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRMulF32x16V) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x16_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<16, float, f32x16_v>(&test_mulf32x16_v, 16, kVerbose);
+#else
+  GTEST_SKIP() << "SRMulF32x16V is not available";
+#endif
+}
+
+TEST(SRVectorBenchmark, SRDivF32x16V) {
+#if HWY_MAX_BYTES >= 64
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x16_v with " << N
+            << " repetitions\n";
+  MeasureFunctionX<16, float, f32x16_v>(&test_divf32x16_v, 16, kVerbose);
+#else
+  GTEST_SKIP() << "SRDivF32x16V is not available";
+#endif
+}
+
+/* Test on single vector passed by value with dynamic dispatch */
+
+TEST(SRVectorDynamicBenchmark, SRAddF32x2D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x2_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, float, f32x2_v>(&test_addf32x2_d, 2, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRSubF32x2D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x2_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, float, f32x2_v>(&test_subf32x2_d, 2, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRMulF32x2D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x2_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, float, f32x2_v>(&test_mulf32x2_d, 2, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRDivF32x2D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x2_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, float, f32x2_v>(&test_divf32x2_d, 2, kVerbose);
+}
+
+/* IEEE-754 binary64 x2 */
+
+TEST(SRVectorDynamicBenchmark, SRAddF64x2D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x2_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, double, f64x2_v>(&test_addf64x2_d, 2, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRSubF64x2D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x2_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, double, f64x2_v>(&test_subf64x2_d, 2, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRMulF64x2D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x2_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, double, f64x2_v>(&test_mulf64x2_d, 2, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRDivF64x2D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x2_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<2, double, f64x2_v>(&test_divf64x2_d, 2, kVerbose);
+}
+
+/* IEEE-754 binary32 x4 */
+
+TEST(SRVectorDynamicBenchmark, SRAddF32x4D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x4_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, float, f32x4_v>(&test_addf32x4_d, 4, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRSubF32x4D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x4_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, float, f32x4_v>(&test_subf32x4_d, 4, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRMulF32x4D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x4_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, float, f32x4_v>(&test_mulf32x4_d, 4, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRDivF32x4D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x4_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, float, f32x4_v>(&test_divf32x4_d, 4, kVerbose);
+}
+
+/* IEEE-754 binary32 x8 */
+
+TEST(SRVectorDynamicBenchmark, SRAddF32x8D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x8_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, float, f32x8_v>(&test_addf32x8_d, 8, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRSubF32x8D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x8_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, float, f32x8_v>(&test_subf32x8_d, 8, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRMulF32x8D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x8_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, float, f32x8_v>(&test_mulf32x8_d, 8, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRDivF32x8D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x8_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, float, f32x8_v>(&test_divf32x8_d, 8, kVerbose);
+}
+
+/* IEEE-754 binary64 x4 */
+
+TEST(SRVectorDynamicBenchmark, SRAddF64x4D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x4_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, double, f64x4_v>(&test_addf64x4_d, 4, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRSubF64x4D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x4_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, double, f64x4_v>(&test_subf64x4_d, 4, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRMulF64x4D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x4_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, double, f64x4_v>(&test_mulf64x4_d, 4, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRDivF64x4D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x4_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<4, double, f64x4_v>(&test_divf64x4_d, 4, kVerbose);
+}
+
+/* IEEE-754 binary64 x8 */
+
+TEST(SRVectorDynamicBenchmark, SRAddF64x8D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf64x8_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, double, f64x8_v>(&test_addf64x8_d, 8, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRSubF64x8D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf64x8_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, double, f64x8_v>(&test_subf64x8_d, 8, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRMulF64x8D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf64x8_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, double, f64x8_v>(&test_mulf64x8_d, 8, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRDivF64x8D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf64x8_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<8, double, f64x8_v>(&test_divf64x8_d, 8, kVerbose);
+}
+
+/* IEEE-754 binary32 x16 */
+
+TEST(SRVectorDynamicBenchmark, SRAddF32x16D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::addf32x16_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<16, float, f32x16_v>(&test_addf32x16_d, 16, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRSubF32x16D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::subf32x16_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<16, float, f32x16_v>(&test_subf32x16_d, 16, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRMulF32x16D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::mulf32x16_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<16, float, f32x16_v>(&test_mulf32x16_d, 16, kVerbose);
+}
+
+TEST(SRVectorDynamicBenchmark, SRDivF32x16D) {
+  constexpr size_t N = 10;
+  std::cout << "Measure function  ud::divf32x16_d with " << N
+            << " repetitions\n";
+  MeasureFunctionX<16, float, f32x16_v>(&test_divf32x16_d, 16, kVerbose);
+}
+
+} // namespace vector
+} // namespace ud
+
+// HYW_TEST_MAIN();1
diff --git a/src/libvfcinstrumentonline/rand/tests/xoroshiro.cpp b/src/libvfcinstrumentonline/rand/tests/test_xoroshiro.cpp
similarity index 98%
rename from src/libvfcinstrumentonline/rand/tests/xoroshiro.cpp
rename to src/libvfcinstrumentonline/rand/tests/test_xoroshiro.cpp
index 2743b493..e60854bc 100644
--- a/src/libvfcinstrumentonline/rand/tests/xoroshiro.cpp
+++ b/src/libvfcinstrumentonline/rand/tests/test_xoroshiro.cpp
@@ -27,6 +27,8 @@ namespace HWY_NAMESPACE { // required: unique per target
 namespace {
 
 constexpr std::uint64_t tests = 1UL << 10;
+constexpr std::uint64_t u64{};
+constexpr std::uint32_t u32{};
 
 std::uint64_t GetSeed() { return static_cast<uint64_t>(std::time(nullptr)); }
 
@@ -35,7 +37,7 @@ void RngLoop(const std::uint64_t seed, std::uint64_t *HWY_RESTRICT result,
   const ScalableTag<std::uint64_t> d;
   VectorXoshiro generator{seed};
   for (size_t i = 0; i < size; i += Lanes(d)) {
-    Store(generator(), d, result + i);
+    Store(generator(u64), d, result + i);
   }
 }
 
@@ -224,7 +226,7 @@ void TestUniformVecDistF32() {
 void TestNextNRandomUint64() {
   const std::uint64_t seed = GetSeed();
   VectorXoshiro generator{seed};
-  const auto result_array = generator.operator()(tests);
+  const auto result_array = generator.operator()(u64, tests);
   std::vector<internal::Xoshiro> reference;
   reference.emplace_back(seed);
   const ScalableTag<std::uint64_t> d;
@@ -252,7 +254,7 @@ void TestNextNRandomUint64() {
 void TestNextFixedNRandomUint64() {
   const std::uint64_t seed = GetSeed();
   VectorXoshiro generator{seed};
-  const auto result_array = generator.operator()<tests>();
+  const auto result_array = generator.operator()<tests>(u64);
   std::vector<internal::Xoshiro> reference;
   reference.emplace_back(seed);
   const ScalableTag<std::uint64_t> d;
diff --git a/src/libvfcinstrumentonline/rand/tests/udround_test.cpp b/src/libvfcinstrumentonline/rand/tests/udround_test.cpp
deleted file mode 100644
index 6e448155..00000000
--- a/src/libvfcinstrumentonline/rand/tests/udround_test.cpp
+++ /dev/null
@@ -1,488 +0,0 @@
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <random>
-#include <vector>
-
-#include <boost/math/distributions/binomial.hpp>
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-
-#include "src/eft.hpp"
-#include "src/main.hpp"
-#include "src/ud.hpp"
-#include "src/utils.hpp"
-#include "tests/helper.hpp"
-
-using ::testing::AllOf;
-using ::testing::Ge;
-using ::testing::Gt;
-using ::testing::Le;
-using ::testing::Lt;
-
-constexpr float default_alpha = 0.05;
-
-namespace reference {
-// return pred(|s|)
-
-// twosum reference
-// compute in double precision if the input type is float
-// compute in quad precision if the input type is double
-template <typename T, typename R = typename helper::IEEE754<T>::H>
-R ud_add(T a, T b) {
-  using H = typename helper::IEEE754<T>::H;
-  return static_cast<H>(a) + static_cast<H>(b);
-}
-
-template <typename T, typename R = typename helper::IEEE754<T>::H>
-R ud_sub(T a, T b) {
-  using H = typename helper::IEEE754<T>::H;
-  return static_cast<H>(a) - static_cast<H>(b);
-}
-
-template <typename T, typename R = typename helper::IEEE754<T>::H>
-R ud_mul(T a, T b) {
-  using H = typename helper::IEEE754<T>::H;
-  return static_cast<H>(a) * static_cast<H>(b);
-}
-
-template <typename T, typename R = typename helper::IEEE754<T>::H>
-R ud_div(T a, T b) {
-  using H = typename helper::IEEE754<T>::H;
-  return static_cast<H>(a) / static_cast<H>(b);
-}
-
-template <typename T> T absolute_distance(T a, T b = 0) {
-  if constexpr (std::is_same<T, Float128>::value) {
-    __uint128_t u = reinterpret_cast<__uint128_t &>(a - b);
-    __uint128_t mask = 1;
-    mask <<= 127;
-    u &= ~mask;
-    return reinterpret_cast<T &>(u);
-  } else {
-    return std::abs(a - b);
-  }
-}
-
-template <typename T> int get_exponent(T a) {
-  int exp = 0;
-  if (a == 0)
-    return 0;
-  if constexpr (std::is_same_v<T, Float128>) {
-    __uint128_t u = reinterpret_cast<__uint128_t &>(a);
-    __uint128_t mask = 0x7fff;
-    u >>= 112;
-    exp = (int)(u & mask);
-    exp -= 0x3FFF;
-  } else {
-    using U = typename helper::IEEE754<T>::U;
-    U u = reinterpret_cast<U &>(a);
-    u >>= helper::IEEE754<T>::mantissa;
-    exp = (int)(u & helper::IEEE754<T>::exponent_mask);
-    exp -= helper::IEEE754<T>::bias;
-  }
-  return exp;
-}
-
-template <typename T> T relative_distance(T a, T b) {
-  if (a == 0 and b == 0)
-    return 0;
-  if (a == 0)
-    return b;
-  if (b == 0)
-    return a;
-  return absolute_distance(a, b) / absolute_distance(a);
-}
-
-}; // namespace reference
-
-template <typename T> class Counter {
-private:
-  int _up_count;
-  int _down_count;
-  T _down;
-  T _up;
-  std::map<T, int> data;
-  bool _is_finalized = false;
-
-public:
-  Counter() : _up_count(0), _down_count(0) {}
-
-  int &operator[](const T &key) {
-    _is_finalized = false;
-    return data[key];
-  }
-
-  void finalize() {
-    if (_is_finalized)
-      return;
-
-    auto keys = data.begin();
-    _down = keys->first;
-    _down_count = keys->second;
-    keys++;
-    _up = keys->first;
-    _up_count = keys->second;
-
-    if (_down > _up) {
-      std::swap(_down, _up);
-      std::swap(_down_count, _up_count);
-    }
-
-    _is_finalized = true;
-  }
-
-  const int size() const { return data.size(); }
-
-  const T &down() {
-    finalize();
-    return _down;
-  }
-
-  const T &up() {
-    finalize();
-    return _up;
-  }
-
-  const int &down_count() {
-    finalize();
-    return _down_count;
-  }
-
-  const int &up_count() {
-    finalize();
-    return _up_count;
-  }
-};
-
-template <typename T> struct Operator {
-  using binary_operator = std::function<T(T, T)>;
-};
-
-// compute ulp(a)
-template <typename T, typename H = typename helper::IEEE754<T>::H>
-H get_ulp(T a) {
-  int exponent;
-  std::frexp(a, &exponent);
-  exponent--;
-  H ulp = std::ldexp(1.0, exponent - helper::IEEE754<T>::mantissa);
-  return ulp;
-}
-
-template <typename T, typename H = typename helper::IEEE754<T>::H>
-std::pair<float, float> compute_distance_error(T a, T b, H reference) {
-
-  T ref_cast = static_cast<T>(reference);
-
-  H distance_to_fp = reference::absolute_distance(reference, ref_cast);
-  H ulp = get_ulp(ref_cast);
-
-  H probability = distance_to_fp / ulp;
-  H probability_c = 1 - probability;
-
-  if (ref_cast > reference)
-    return {probability_c, probability};
-  else
-    return {probability, probability_c};
-}
-
-template <typename T, typename Op = typename Operator<T>::binary>
-Counter<T> eval_op(T a, T b, Op op, const int repetitions = 1000) {
-  Counter<T> c;
-  for (int i = 0; i < repetitions; i++) {
-    T v = op(a, b);
-    c[v]++;
-  }
-
-  EXPECT_EQ(c.size(), 2) << "More than two values returned by "
-                            "target function";
-  EXPECT_EQ(c.down_count() + c.up_count(), repetitions)
-      << "Total count mismatch";
-
-  return c;
-}
-
-struct BinomialTest {
-  double lower;
-  double upper;
-  double pvalue;
-};
-
-BinomialTest binomial_test(const int n, const int k, const float p,
-                           const float alpha = 0.05) {
-
-  boost::math::binomial_distribution<> dist(n, p);
-  double lower = boost::math::cdf(dist, k);
-  double upper = boost::math::cdf(complement(dist, k - 1));
-  double pvalue = 2 * std::min(lower, upper);
-
-  return BinomialTest{lower, upper, pvalue};
-}
-
-std::string demangle(const char *name) {
-  int status = -1;
-  std::unique_ptr<char, void (*)(void *)> res{
-      abi::__cxa_demangle(name, NULL, NULL, &status), std::free};
-  return (status == 0) ? res.get() : name;
-}
-
-template <typename T, typename Op = typename Operator<T>::binary>
-void check_exact_proportion(T a, T b, Op op, const float proportion,
-                            const int repetitions = 1000,
-                            const float alpha = default_alpha) {
-
-  auto counter = eval_op(a, b, op, repetitions);
-  auto test = binomial_test(repetitions, counter.down_count(), proportion);
-
-  EXPECT_THAT(alpha / 2, Lt(test.pvalue))
-      << "Null hypotheis rejected!\n"
-      // << "op: " << op << "\n"
-      << "alpha: " << alpha << "\n"
-      << "proportion: " << proportion << "\n"
-      << "count_down: " << counter.down_count() << "\n"
-      << "sample size: " << repetitions << "\n"
-      << "pvalue: " << test.pvalue << "\n"
-      << "lower_bound: " << test.lower << "\n"
-      << "upper_bound: " << test.upper << "\n"
-      << std::hexfloat << "down: " << counter.down() << "\n"
-      << std::hexfloat << "up: " << counter.down() << "\n";
-}
-
-template <typename T, typename Op = typename Operator<T>::binary>
-void check_distribution_match(T a, T b, Op reference_op, Op target_op,
-                              const int repetitions = 1000,
-                              const float alpha = default_alpha) {
-  using H = typename helper::IEEE754<T>::H;
-
-  H reference = reference_op(a, b);
-  auto [down, up] = compute_distance_error(a, b, reference);
-
-  auto counter = eval_op(a, b, target_op, repetitions);
-  auto count_down = counter.down_count();
-  auto count_up = counter.up_count();
-
-  EXPECT_EQ(count_down, repetitions * down) << "Down value count mismatch";
-  EXPECT_EQ(count_up, repetitions * up) << "Up value count mismatch";
-  EXPECT_EQ(count_down + count_up, repetitions) << "Total count mismatch";
-
-  auto down_test = binomial_test(repetitions, count_down, down, alpha);
-
-  EXPECT_THAT(down, AllOf(Ge(down_test.lower_bound), Le(down_test.upper_bound)))
-      << "Propability of down value mismatch!\n"
-      << "op: " << target_op.target_type().name() << "\n"
-      << "down: " << down << "\n"
-      << "lower_bound_down: " << down_test.lower_bound << "\n"
-      << "upper_bound_down: " << down_test.upper_bound
-      << "sample size: " << repetitions << "\n"
-      << "count_down: " << counter.down_count() << "\n"
-      << "alpha: " << alpha << "\n";
-
-  auto up_test = binomial_test(repetitions, count_up, up, alpha);
-
-  EXPECT_THAT(up, AllOf(Ge(up_test.lower_bound), Le(up_test.upper_bound)))
-      << "Propability of up value mismatch\n"
-      << "op: " << target_op.target_type().name() << "\n"
-      << "up: " << counter.up() << "\n"
-      << "lower_bound_up: " << up_test.lower_bound << "\n"
-      << "upper_bound_up: " << up_test.upper_bound
-      << "sample size: " << repetitions << "\n"
-      << "count_up: " << counter.up_count() << "\n"
-      << "alpha: " << alpha << "\n";
-}
-
-template <typename T> std::vector<T> get_simple_case() {
-  std::vector<T> simple_case = {0.0,
-                                1.0,
-                                2.0,
-                                3.0,
-                                std::numeric_limits<T>::min(),
-                                std::numeric_limits<T>::lowest(),
-                                std::numeric_limits<T>::max(),
-                                std::numeric_limits<T>::epsilon(),
-                                std::numeric_limits<T>::infinity(),
-                                std::numeric_limits<T>::denorm_min(),
-                                std::numeric_limits<T>::quiet_NaN(),
-                                std::numeric_limits<T>::signaling_NaN()};
-  return simple_case;
-}
-
-enum class OperatorType { ADD, SUB, MUL, DIV };
-
-template <typename T, OperatorType op_type> void do_basic_test() {
-  using I = typename helper::IEEE754<T>::I;
-  constexpr I mantissa = helper::IEEE754<T>::mantissa;
-
-  T a = 1.25;
-  T b;
-
-  std::function<T(T, T)> op;
-  switch (op_type) {
-  case OperatorType::ADD:
-    op = ud_add<T>;
-    break;
-  case OperatorType::SUB:
-    op = ud_sub<T>;
-    break;
-  case OperatorType::MUL:
-    op = ud_mul<T>;
-    break;
-  case OperatorType::DIV:
-    op = ud_div<T>;
-    break;
-  }
-
-  for (int i = 0; i <= 5; i++) {
-    b = std::ldexp(1.0, -(mantissa + i));
-    check_exact_proportion(a, b, op, 0.5, 100000, 0.01);
-  }
-}
-
-template <typename T> void do_basic_test_add() {
-  do_basic_test<T, OperatorType::ADD>();
-}
-
-template <typename T> void do_basic_test_sub() {
-  do_basic_test<T, OperatorType::SUB>();
-}
-
-template <typename T> void do_basic_test_mul() {
-  do_basic_test<T, OperatorType::MUL>();
-}
-
-template <typename T> void do_basic_test_div() {
-  do_basic_test<T, OperatorType::DIV>();
-}
-
-TEST(UDRoundTest, BasicAssertionsAddition) {
-  do_basic_test_add<float>();
-  do_basic_test_add<double>();
-}
-
-TEST(UDRoundTest, BasicAssertionsSubstraction) {
-  do_basic_test_sub<float>();
-  do_basic_test_sub<double>();
-}
-
-TEST(UDRoundTest, BasicAssertionsMultiplication) {
-  do_basic_test_mul<float>();
-  do_basic_test_mul<double>();
-}
-
-TEST(UDRoundTest, BasicAssertionsDivision) {
-  do_basic_test_div<float>();
-  do_basic_test_div<double>();
-}
-
-// TEST(UDRoundTest, Random01Assertions) {
-//   RNG rng;
-
-//   for (int i = 0; i < 1000; i++) {
-//     float a = rng();
-//     float b = rng();
-//     test_equality(a, b);
-//     test_equality(a, -b);
-//     test_equality(-a, b);
-//     test_equality(-a, -b);
-//   }
-
-//   for (int i = 0; i < 1000; i++) {
-//     double a = rng();
-//     double b = rng();
-//     test_equality(a, b);
-//     test_equality(a, -b);
-//     test_equality(-a, b);
-//     test_equality(-a, -b);
-//   }
-// }
-
-// TEST(UDRoundTest, RandomNoOverlapAssertions) {
-//   RNG rng_0v1_float(0, 1);
-//   RNG rng_24v25_float(0x1p24, 0x1p25);
-
-//   for (int i = 0; i < 1000; i++) {
-//     float a = rng_0v1_float();
-//     float b = rng_24v25_float();
-//     test_equality(a, b);
-//     test_equality(a, -b);
-//     test_equality(-a, b);
-//     test_equality(-a, -b);
-//   }
-
-//   RNG rng_0v1_double(0, 1);
-//   RNG rng_53v54_double(0x1p53, 0x1p54);
-
-//   for (int i = 0; i < 1000; i++) {
-//     double a = rng_0v1_double();
-//     double b = rng_53v54_double();
-//     test_equality(a, b);
-//     test_equality(a, -b);
-//     test_equality(-a, b);
-//     test_equality(-a, -b);
-//   }
-// }
-
-// TEST(UDRoundTest, RandomLastBitOverlapAssertions) {
-//   RNG rng_1v2_float(1, 2);
-//   RNG rng_23v24_float(0x1p23, 0x1p24);
-
-//   for (int i = 0; i < 1000; i++) {
-//     float a = rng_1v2_float();
-//     float b = rng_23v24_float();
-//     test_equality(a, b);
-//     test_equality(a, -b);
-//     test_equality(-a, b);
-//     test_equality(-a, -b);
-//   }
-
-//   RNG rng_1v2_double(1, 2);
-//   RNG rng_52v53_double(0x1p52, 0x1p53);
-
-//   for (int i = 0; i < 1000; i++) {
-//     double a = rng_1v2_double();
-//     double b = rng_52v53_double();
-//     test_equality(a, b);
-//     test_equality(a, -b);
-//     test_equality(-a, b);
-//     test_equality(-a, -b);
-//   }
-// }
-
-// TEST(UDRoundTest, RandomMidOverlapAssertions) {
-//   RNG rng_0v1_float(0, 1);
-//   RNG rng_12v13_float(0x1p12, 0x1p13);
-
-//   for (int i = 0; i < 1000; i++) {
-//     float a = rng_0v1_float();
-//     float b = rng_12v13_float();
-//     test_equality(a, b);
-//     test_equality(a, -b);
-//     test_equality(-a, b);
-//     test_equality(-a, -b);
-//   }
-
-//   RNG rng_0v1_double(0, 1);
-//   RNG rng_26v27_double(0x1p26, 0x1p27);
-
-//   for (int i = 0; i < 1000; i++) {
-//     double a = rng_0v1_double();
-//     double b = rng_26v27_double();
-//     test_equality(a, b);
-//     test_equality(a, -b);
-//     test_equality(-a, b);
-//     test_equality(-a, -b);
-//   }
-// }
-
-// TEST(UDRoundTest, BinadeAssertions) {
-//   for (int i = -126; i < 127; i++) {
-//     testBinade<float>(i);
-//     testBinade<double>(i);
-//   }
-// }
-
-int main(int argc, char **argv) {
-  testing::InitGoogleTest(&argc, argv);
-  init();
-  return RUN_ALL_TESTS();
-}
\ No newline at end of file