Add up-down operators

yohanchatelain · Nov 21, 2024 · 022cadf · 022cadf
1 parent 6fea34f
commit 022cadf
Show file tree

Hide file tree

Showing 40 changed files with 4,613 additions and 851 deletions.
diff --git a/src/libvfcinstrumentonline/rand/build-ir.sh b/src/libvfcinstrumentonline/rand/build-ir.sh
@@ -6,4 +6,11 @@ bazelisk build --repo_env=CC=$(llvm-config-14 --bindir)/clang \
     --cxxopt="-march=native" --cxxopt="-emit-llvm" --cxxopt="-O3" --save_temps \
     --cxxopt="-USR_DEBUG" --cxxopt="-g0"
 
+bazelisk build --repo_env=CC=$(llvm-config-14 --bindir)/clang \
+    --repo_env=CXX=$(llvm-config-14 --bindir)/clang++ \
+    --compile_one_dependency //src:ud_hw.cpp \
+    --cxxopt="-march=native" --cxxopt="-emit-llvm" --cxxopt="-O3" --save_temps \
+    --cxxopt="-USR_DEBUG" --cxxopt="-g0"
+
 cp -f bazel-bin/src/_objs/sr/sr_hw.pic.s sr_hw.ll
+cp -f bazel-bin/src/_objs/ud/ud_hw.pic.s ud_hw.ll
diff --git a/src/libvfcinstrumentonline/rand/src/BUILD b/src/libvfcinstrumentonline/rand/src/BUILD
@@ -10,7 +10,10 @@ exports_files([
     "sr_hw.h",
     "sr.h",
     "sr_hw-inl.h",
-    "ud.hpp",
+    "ud.h",
+    "ud_hw.h",
+    "ud_hw.cpp",
+    "ud_hw-inl.h",
     "utils.hpp",
     "xoroshiro256+_hw.hpp",
     "xoroshiro256+_hw.h",
@@ -20,23 +23,62 @@ exports_files([
 
 # Compiler options
 COPTS = [
-    "-std=c++20",
+    "-std=c++17",
     "-Wfatal-errors",
 ]
 
 filegroup(
-    name = "srcs",
-    srcs = glob([
-        "*.cpp",
-        "*.hpp",
-        "*.h",
-    ]),
+    name = "xoroshiro",
+    srcs = [
+        "debug_hwy-inl.h",
+        "random-inl.h",
+        "target-utils.h",
+        "xoroshiro256+_hw.cpp",
+        "xoroshiro256+_hw.h",
+        "xoroshiro256+_hw-inl.hpp",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "srcs-sr",
+    srcs = glob(
+        [
+            "*.cpp",
+            "*.hpp",
+            "*.h",
+        ],
+        exclude = [
+            "ud_hw.cpp",
+            "ud_hw.h",
+            "ud_hw-inl.h",
+            "ud.h",
+        ],
+    ),
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "srcs-ud",
+    srcs = glob(
+        [
+            "*.cpp",
+            "*.hpp",
+            "*.h",
+        ],
+        exclude = [
+            "sr_hw.cpp",
+            "sr_hw.h",
+            "sr_hw-inl.h",
+            "sr.h",
+        ],
+    ),
     visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "sr",
-    srcs = [":srcs"],
+    srcs = [":srcs-sr"],
     hdrs = ["//src:sr_hw.h"],
     copts = COPTS + [
         "-O2",
@@ -47,6 +89,19 @@ cc_library(
     deps = ["@hwy"],
 )
 
+cc_library(
+    name = "ud",
+    srcs = [":srcs-ud"],
+    hdrs = ["//src:ud_hw.h"],
+    copts = COPTS + [
+        "-O2",
+        "-Wall",
+        "-Wno-psabi",
+    ],
+    visibility = ["//visibility:public"],
+    deps = ["@hwy"],
+)
+
 cc_library(
     name = "sr-dbg",
     srcs = [
@@ -66,10 +121,8 @@ cc_library(
         "//src:sr_hw.h",
     ],
     copts = COPTS + [
-        "-std=c++20",
         "-Og",
         "-g",
-        "-march=native",
         "-Wfatal-errors",
         "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)",
         "-DSR_DEBUG",

diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/generate.py b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/generate.py
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp
@@ -3,6 +3,8 @@
 #include <iostream>
 #include <map>
 
+const std::size_t N = 1'000'000;
+
 #ifndef REAL
 #define REAL double
 #warning "REAL not defined, using double"
@@ -49,10 +51,10 @@ int main(int argc, char *argv[]) {
 
   std::map<REAL, int> visited;
 
-  for (int i = 0; i < 1000; i++)
+  for (int i = 0; i < N; i++)
     visited[apply_op(argv[1][0], a, b)]++;
 
-  compute_proba(visited, 1000);
+  compute_proba(visited, N);
 
   return 0;
 }
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_double b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_double
diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_float b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_float
diff --git a/src/libvfcinstrumentonline/rand/src/random-inl.h b/src/libvfcinstrumentonline/rand/src/random-inl.h
@@ -205,7 +205,7 @@ class Xoshiro {
 class VectorXoshiro {
 private:
   using VU64 = Vec<ScalableTag<std::uint64_t>>;
-  using VU32 = Vec<ScalableTag<std::uint64_t>>;
+  using VU32 = Vec<ScalableTag<std::uint32_t>>;
   using StateType = AlignedNDArray<std::uint64_t, 2>;
   using VF32 = Vec<ScalableTag<float>>;
 #if HWY_HAVE_FLOAT64
@@ -232,9 +232,34 @@ class VectorXoshiro {
     }
   }
 
-  HWY_INLINE VU64 operator()() noexcept { return Next(); }
+  HWY_INLINE VU32 operator()(std::uint32_t) noexcept {
+    const auto result = Next();
+    return BitCast(ScalableTag<std::uint32_t>{}, result);
+  }
+
+  HWY_INLINE VU64 operator()(std::uint64_t) noexcept { return Next(); }
 
-  AlignedVector<std::uint64_t> operator()(const std::size_t n) {
+  AlignedVector<std::uint32_t> operator()(std::uint32_t, const std::size_t n) {
+    const auto u32_tag = ScalableTag<std::uint32_t>{};
+    AlignedVector<std::uint32_t> result(n);
+    const ScalableTag<std::uint64_t> tag{};
+    auto s0 = Load(tag, state_[{0}].data());
+    auto s1 = Load(tag, state_[{1}].data());
+    auto s2 = Load(tag, state_[{2}].data());
+    auto s3 = Load(tag, state_[{3}].data());
+    for (std::uint64_t i = 0; i < n; i += Lanes(u32_tag)) {
+      const auto next = Update(s0, s1, s2, s3);
+      const auto next_u32 = BitCast(u32_tag, next);
+      Store(next_u32, u32_tag, result.data() + i);
+    }
+    Store(s0, tag, state_[{0}].data());
+    Store(s1, tag, state_[{1}].data());
+    Store(s2, tag, state_[{2}].data());
+    Store(s3, tag, state_[{3}].data());
+    return result;
+  }
+
+  AlignedVector<std::uint64_t> operator()(std::uint64_t, const std::size_t n) {
     AlignedVector<std::uint64_t> result(n);
     const ScalableTag<std::uint64_t> tag{};
     auto s0 = Load(tag, state_[{0}].data());
@@ -252,8 +277,29 @@ class VectorXoshiro {
     return result;
   }
 
+  template <std::uint32_t N>
+  std::array<std::uint32_t, N> operator()(std::uint32_t) noexcept {
+    alignas(HWY_ALIGNMENT) std::array<std::uint32_t, N> result;
+    const ScalableTag<std::uint64_t> tag{};
+    const ScalableTag<std::uint32_t> u32_tag{};
+    auto s0 = Load(tag, state_[{0}].data());
+    auto s1 = Load(tag, state_[{1}].data());
+    auto s2 = Load(tag, state_[{2}].data());
+    auto s3 = Load(tag, state_[{3}].data());
+    for (std::uint64_t i = 0; i < N; i += Lanes(u32_tag)) {
+      const auto next = Update(s0, s1, s2, s3);
+      const auto next_u32 = BitCast(u32_tag, next);
+      Store(next_u32, u32_tag, result.data() + i);
+    }
+    Store(s0, tag, state_[{0}].data());
+    Store(s1, tag, state_[{1}].data());
+    Store(s2, tag, state_[{2}].data());
+    Store(s3, tag, state_[{3}].data());
+    return result;
+  }
+
   template <std::uint64_t N>
-  std::array<std::uint64_t, N> operator()() noexcept {
+  std::array<std::uint64_t, N> operator()(std::uint64_t) noexcept {
     alignas(HWY_ALIGNMENT) std::array<std::uint64_t, N> result;
     const ScalableTag<std::uint64_t> tag{};
     auto s0 = Load(tag, state_[{0}].data());
@@ -294,10 +340,7 @@ class VectorXoshiro {
   }
 
   HWY_INLINE AlignedVector<float> Uniform(float, const std::size_t n) {
-    // std::cerr << "Uniform float start\n";
     AlignedVector<float> result(n);
-    // std::cerr << "result address: " << &result << "\n";
-    // std::cerr << "result size: " << result.size() << "\n";
     const ScalableTag<std::uint32_t> u32_tag{};
     const ScalableTag<std::uint64_t> tag{};
     const ScalableTag<float> real_tag{};
@@ -309,22 +352,17 @@ class VectorXoshiro {
     auto s3 = Load(tag, state_[{3}].data());
 
     for (std::size_t i = 0; i < n; i += Lanes(real_tag)) {
-      // std::cerr << "i: " << i << "\n";
       const auto next = Update(s0, s1, s2, s3);
       const auto bits = BitCast(u32_tag, next);
       const auto bitscast = ShiftRight<8>(bits);
       const auto real = ConvertTo(real_tag, bitscast);
       const auto uniform = Mul(real, MUL_VALUE);
-      // std::cerr << "store at " << i << " " << result.data() + i << "\n";
       Store(uniform, real_tag, result.data() + i);
     }
-
     Store(s0, tag, state_[{0}].data());
     Store(s1, tag, state_[{1}].data());
     Store(s2, tag, state_[{2}].data());
     Store(s3, tag, state_[{3}].data());
-
-    // std::cerr << "Uniform float end\n";
     return result;
   }
 
@@ -466,12 +504,12 @@ template <std::uint64_t size = 1024> class CachedXoshiro {
 
   explicit CachedXoshiro(const result_type seed,
                          const result_type threadNumber = 0)
-      : generator_{seed, threadNumber}, cache_{generator_.operator()<size>()},
-        index_{0} {}
+      : generator_{seed, threadNumber},
+        cache_{generator_.operator()<size>(result_type{})}, index_{0} {}
 
   result_type operator()() noexcept {
     if (HWY_UNLIKELY(index_ == size)) {
-      cache_ = std::move(generator_.operator()<size>());
+      cache_ = std::move(generator_.operator()<size>(result_type{}));
       index_ = 0;
     }
     return cache_[index_++];

diff --git a/src/libvfcinstrumentonline/rand/src/sr.h b/src/libvfcinstrumentonline/rand/src/sr.h
@@ -34,7 +34,7 @@ template <typename T> inline T sround(const T sigma, const T tau) {
   debug_start();
   if (tau == 0) {
     debug_end();
-    return sigma;
+    return 0;
   }
   constexpr int32_t mantissa = IEEE754<T>::mantissa;
   const bool sign_tau = tau < 0;

diff --git a/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h b/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h
@@ -49,15 +49,18 @@ void twosum(V a, V b, V &sigma, V &tau) {
 }
 
 /*
+"Emulation of the FMA in rounded-to-nearest floating-point arithmetic"
+Stef Graillat, Jean-Michel Muller
+---
 Algorithm 3 – Split(x, s). Veltkamp’s splitting algorithm. Returns a pair
 (xh, xℓ) of FP numbers such that the significand of xh fits in p − s bits, the
 significand of xℓ fits in s − 1 bits, and xh + xℓ = x.
-Require: K = 2s + 1
+Require: K = 2^s + 1
 Require: 2 ≤ s ≤ p − 2
 γ ← RN(K · x)
 δ ← RN(x − γ)
-ah ← RN(γ + δ)
-aℓ ← RN(x − ah)
+xh ← RN(γ + δ)
+xℓ ← RN(x − xh)
 return (xh, xℓ)
 */
 template <class D, class V = hn::TFromD<D>, typename T = hn::TFromD<D>>
@@ -80,6 +83,9 @@ void Split(V x, V &xh, V &xl) {
 }
 
 /*
+"Emulation of the FMA in rounded-to-nearest floating-point arithmetic"
+Stef Graillat, Jean-Michel Muller
+---
 Algorithm 4 – DekkerProd(a, b). Dekker’s product. Returns a pair (πh, πℓ)
 of FP numbers such that πh = RN(ab) and πh + πℓ = ab.
 Require: s = ⌈p/2⌉
@@ -115,6 +121,9 @@ void DekkerProd(V a, V b, V &pi_h, V &pi_l) {
 }
 
 /*
+"Emulation of the FMA in rounded-to-nearest floating-point arithmetic"
+Stef Graillat, Jean-Michel Muller
+---
 Algorithm 7 EmulFMA(a, b, c).
 Require: P = 2^(p−1) + 1
 Require: Q = 2^(p−1)
@@ -192,11 +201,12 @@ V fma(V a, V b, V c) {
   auto g = hn::Mul(t, w);
   auto mask3 = hn::Lt(g, hn::Zero(d)); // if g < 0 then
 
-  auto res = hn::IfThenElse(
-      mask, d_temp_1,
-      hn::IfThenElse(mask1, z_h,
-                     hn::IfThenElse(mask2, d_temp_2,
-                                    hn::IfThenElse(mask3, z_h, d_temp_2))));
+  auto ret3 = hn::IfThenElse(mask3, z_h, d_temp_2);
+  auto ret2 = hn::IfThenElse(mask2, d_temp_2, ret3);
+  auto ret1 = hn::IfThenElse(mask1, z_h, ret2);
+  auto ret = hn::IfThenElse(mask, d_temp_1, ret1);
+
+  auto res = ret;
 
   debug_vec<D>("[fma] res", res);
   debug_msg("[fma] END\n");
@@ -276,7 +286,7 @@ HWY_INLINE hn::Vec<D> FastPow2I(D d, VI x) {
   const hn::Rebind<hwy::MakeSigned<D>, D> di;
   const auto kOffset = Set(di, kOffsetS);
   const auto offset = Add(x, kOffset);
-  const auto shift = ShiftLeft<mantissa>(offset);
+  const auto shift = hn::ShiftLeft<mantissa>(offset);
   return BitCast(d, shift);
 }
 
@@ -329,7 +339,7 @@ hn::Vec<D> pow2(D d, V n) {
   return hn::BitCast(d, res);
 }
 
-template <class D, class V, typename T = hn::TFromD<D>>
+template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
 V sr_round(V sigma, V tau) {
   debug_msg("\n[sr_round] START");
   debug_vec<D>("[sr_round] sigma", sigma);
@@ -362,14 +372,17 @@ V sr_round(V sigma, V tau) {
 
   auto exp = hn::Sub(eta, hn::Set(di, mantissa));
   auto abs_ulp = pow2(d, exp);
-  debug_vec<D>("[sr_round] abs_ulp", abs_ulp);
+  debug_vec<D>("[sr_round] |ulp|", abs_ulp);
+
   auto ulp = hn::CopySign(abs_ulp, tau);
   debug_vec<D>("[sr_round] ulp", ulp);
 
   auto pi = hn::Mul(ulp, z);
   debug_vec<D>("[sr_round] pi", pi);
+
   auto abs_tau_plus_pi = hn::Abs(hn::Add(tau, pi));
-  debug_vec<D>("[sr_round] abs_tau_plus_pi", abs_tau_plus_pi);
+  debug_vec<D>("[sr_round] |tau|+pi", abs_tau_plus_pi);
+
   auto round = hn::IfThenElse(hn::Ge(abs_tau_plus_pi, abs_ulp), ulp, zero);
   debug_vec<D>("[sr_round] round", round);