diff --git a/src/libvfcinstrumentonline/rand/build-ir.sh b/src/libvfcinstrumentonline/rand/build-ir.sh index 58242ad2..49963c06 100755 --- a/src/libvfcinstrumentonline/rand/build-ir.sh +++ b/src/libvfcinstrumentonline/rand/build-ir.sh @@ -6,4 +6,11 @@ bazelisk build --repo_env=CC=$(llvm-config-14 --bindir)/clang \ --cxxopt="-march=native" --cxxopt="-emit-llvm" --cxxopt="-O3" --save_temps \ --cxxopt="-USR_DEBUG" --cxxopt="-g0" +bazelisk build --repo_env=CC=$(llvm-config-14 --bindir)/clang \ + --repo_env=CXX=$(llvm-config-14 --bindir)/clang++ \ + --compile_one_dependency //src:ud_hw.cpp \ + --cxxopt="-march=native" --cxxopt="-emit-llvm" --cxxopt="-O3" --save_temps \ + --cxxopt="-USR_DEBUG" --cxxopt="-g0" + cp -f bazel-bin/src/_objs/sr/sr_hw.pic.s sr_hw.ll +cp -f bazel-bin/src/_objs/ud/ud_hw.pic.s ud_hw.ll diff --git a/src/libvfcinstrumentonline/rand/src/BUILD b/src/libvfcinstrumentonline/rand/src/BUILD index dc9cf666..02fc564b 100644 --- a/src/libvfcinstrumentonline/rand/src/BUILD +++ b/src/libvfcinstrumentonline/rand/src/BUILD @@ -10,7 +10,10 @@ exports_files([ "sr_hw.h", "sr.h", "sr_hw-inl.h", - "ud.hpp", + "ud.h", + "ud_hw.h", + "ud_hw.cpp", + "ud_hw-inl.h", "utils.hpp", "xoroshiro256+_hw.hpp", "xoroshiro256+_hw.h", @@ -20,23 +23,62 @@ exports_files([ # Compiler options COPTS = [ - "-std=c++20", + "-std=c++17", "-Wfatal-errors", ] filegroup( - name = "srcs", - srcs = glob([ - "*.cpp", - "*.hpp", - "*.h", - ]), + name = "xoroshiro", + srcs = [ + "debug_hwy-inl.h", + "random-inl.h", + "target-utils.h", + "xoroshiro256+_hw.cpp", + "xoroshiro256+_hw.h", + "xoroshiro256+_hw-inl.hpp", + ], + visibility = ["//visibility:public"], +) + +filegroup( + name = "srcs-sr", + srcs = glob( + [ + "*.cpp", + "*.hpp", + "*.h", + ], + exclude = [ + "ud_hw.cpp", + "ud_hw.h", + "ud_hw-inl.h", + "ud.h", + ], + ), + visibility = ["//visibility:public"], +) + +filegroup( + name = "srcs-ud", + srcs = glob( + [ + "*.cpp", + "*.hpp", + "*.h", + ], + exclude = [ + "sr_hw.cpp", + "sr_hw.h", + "sr_hw-inl.h", + "sr.h", + ], + ), visibility = ["//visibility:public"], ) cc_library( name = "sr", - srcs = [":srcs"], + srcs = [":srcs-sr"], hdrs = ["//src:sr_hw.h"], copts = COPTS + [ "-O2", @@ -47,6 +89,19 @@ cc_library( deps = ["@hwy"], ) +cc_library( + name = "ud", + srcs = [":srcs-ud"], + hdrs = ["//src:ud_hw.h"], + copts = COPTS + [ + "-O2", + "-Wall", + "-Wno-psabi", + ], + visibility = ["//visibility:public"], + deps = ["@hwy"], +) + cc_library( name = "sr-dbg", srcs = [ @@ -66,10 +121,8 @@ cc_library( "//src:sr_hw.h", ], copts = COPTS + [ - "-std=c++20", "-Og", "-g", - "-march=native", "-Wfatal-errors", "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)", "-DSR_DEBUG", diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/generate.py b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/generate.py new file mode 100644 index 00000000..e69de29b diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test new file mode 100755 index 00000000..cddbe63b Binary files /dev/null and b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test differ diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp index 7348e0cd..ad274e79 100644 --- a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp +++ b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test.cpp @@ -3,6 +3,8 @@ #include #include +const std::size_t N = 1'000'000; + #ifndef REAL #define REAL double #warning "REAL not defined, using double" @@ -49,10 +51,10 @@ int main(int argc, char *argv[]) { std::map visited; - for (int i = 0; i < 1000; i++) + for (int i = 0; i < N; i++) visited[apply_op(argv[1][0], a, b)]++; - compute_proba(visited, 1000); + compute_proba(visited, N); return 0; } \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_double b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_double new file mode 100755 index 00000000..7c89fe72 Binary files /dev/null and b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_double differ diff --git a/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_float b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_float new file mode 100755 index 00000000..7b477ff3 Binary files /dev/null and b/src/libvfcinstrumentonline/rand/src/inputs_test_generator/test_float differ diff --git a/src/libvfcinstrumentonline/rand/src/random-inl.h b/src/libvfcinstrumentonline/rand/src/random-inl.h index 69360e90..31cbeb81 100644 --- a/src/libvfcinstrumentonline/rand/src/random-inl.h +++ b/src/libvfcinstrumentonline/rand/src/random-inl.h @@ -205,7 +205,7 @@ class Xoshiro { class VectorXoshiro { private: using VU64 = Vec>; - using VU32 = Vec>; + using VU32 = Vec>; using StateType = AlignedNDArray; using VF32 = Vec>; #if HWY_HAVE_FLOAT64 @@ -232,9 +232,34 @@ class VectorXoshiro { } } - HWY_INLINE VU64 operator()() noexcept { return Next(); } + HWY_INLINE VU32 operator()(std::uint32_t) noexcept { + const auto result = Next(); + return BitCast(ScalableTag{}, result); + } + + HWY_INLINE VU64 operator()(std::uint64_t) noexcept { return Next(); } - AlignedVector operator()(const std::size_t n) { + AlignedVector operator()(std::uint32_t, const std::size_t n) { + const auto u32_tag = ScalableTag{}; + AlignedVector result(n); + const ScalableTag tag{}; + auto s0 = Load(tag, state_[{0}].data()); + auto s1 = Load(tag, state_[{1}].data()); + auto s2 = Load(tag, state_[{2}].data()); + auto s3 = Load(tag, state_[{3}].data()); + for (std::uint64_t i = 0; i < n; i += Lanes(u32_tag)) { + const auto next = Update(s0, s1, s2, s3); + const auto next_u32 = BitCast(u32_tag, next); + Store(next_u32, u32_tag, result.data() + i); + } + Store(s0, tag, state_[{0}].data()); + Store(s1, tag, state_[{1}].data()); + Store(s2, tag, state_[{2}].data()); + Store(s3, tag, state_[{3}].data()); + return result; + } + + AlignedVector operator()(std::uint64_t, const std::size_t n) { AlignedVector result(n); const ScalableTag tag{}; auto s0 = Load(tag, state_[{0}].data()); @@ -252,8 +277,29 @@ class VectorXoshiro { return result; } + template + std::array operator()(std::uint32_t) noexcept { + alignas(HWY_ALIGNMENT) std::array result; + const ScalableTag tag{}; + const ScalableTag u32_tag{}; + auto s0 = Load(tag, state_[{0}].data()); + auto s1 = Load(tag, state_[{1}].data()); + auto s2 = Load(tag, state_[{2}].data()); + auto s3 = Load(tag, state_[{3}].data()); + for (std::uint64_t i = 0; i < N; i += Lanes(u32_tag)) { + const auto next = Update(s0, s1, s2, s3); + const auto next_u32 = BitCast(u32_tag, next); + Store(next_u32, u32_tag, result.data() + i); + } + Store(s0, tag, state_[{0}].data()); + Store(s1, tag, state_[{1}].data()); + Store(s2, tag, state_[{2}].data()); + Store(s3, tag, state_[{3}].data()); + return result; + } + template - std::array operator()() noexcept { + std::array operator()(std::uint64_t) noexcept { alignas(HWY_ALIGNMENT) std::array result; const ScalableTag tag{}; auto s0 = Load(tag, state_[{0}].data()); @@ -294,10 +340,7 @@ class VectorXoshiro { } HWY_INLINE AlignedVector Uniform(float, const std::size_t n) { - // std::cerr << "Uniform float start\n"; AlignedVector result(n); - // std::cerr << "result address: " << &result << "\n"; - // std::cerr << "result size: " << result.size() << "\n"; const ScalableTag u32_tag{}; const ScalableTag tag{}; const ScalableTag real_tag{}; @@ -309,22 +352,17 @@ class VectorXoshiro { auto s3 = Load(tag, state_[{3}].data()); for (std::size_t i = 0; i < n; i += Lanes(real_tag)) { - // std::cerr << "i: " << i << "\n"; const auto next = Update(s0, s1, s2, s3); const auto bits = BitCast(u32_tag, next); const auto bitscast = ShiftRight<8>(bits); const auto real = ConvertTo(real_tag, bitscast); const auto uniform = Mul(real, MUL_VALUE); - // std::cerr << "store at " << i << " " << result.data() + i << "\n"; Store(uniform, real_tag, result.data() + i); } - Store(s0, tag, state_[{0}].data()); Store(s1, tag, state_[{1}].data()); Store(s2, tag, state_[{2}].data()); Store(s3, tag, state_[{3}].data()); - - // std::cerr << "Uniform float end\n"; return result; } @@ -466,12 +504,12 @@ template class CachedXoshiro { explicit CachedXoshiro(const result_type seed, const result_type threadNumber = 0) - : generator_{seed, threadNumber}, cache_{generator_.operator()()}, - index_{0} {} + : generator_{seed, threadNumber}, + cache_{generator_.operator()(result_type{})}, index_{0} {} result_type operator()() noexcept { if (HWY_UNLIKELY(index_ == size)) { - cache_ = std::move(generator_.operator()()); + cache_ = std::move(generator_.operator()(result_type{})); index_ = 0; } return cache_[index_++]; diff --git a/src/libvfcinstrumentonline/rand/src/sr.h b/src/libvfcinstrumentonline/rand/src/sr.h index 230546ec..89c92e88 100644 --- a/src/libvfcinstrumentonline/rand/src/sr.h +++ b/src/libvfcinstrumentonline/rand/src/sr.h @@ -34,7 +34,7 @@ template inline T sround(const T sigma, const T tau) { debug_start(); if (tau == 0) { debug_end(); - return sigma; + return 0; } constexpr int32_t mantissa = IEEE754::mantissa; const bool sign_tau = tau < 0; diff --git a/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h b/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h index f173e1bc..fac8f114 100644 --- a/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h +++ b/src/libvfcinstrumentonline/rand/src/sr_hw-inl.h @@ -49,15 +49,18 @@ void twosum(V a, V b, V &sigma, V &tau) { } /* +"Emulation of the FMA in rounded-to-nearest loating-point arithmetic" +Stef Graillat, Jean-Michel Muller +--- Algorithm 3 – Split(x, s). Veltkamp’s splitting algorithm. Returns a pair (xh, xℓ) of FP numbers such that the significand of xh fits in s − p bits, the significand of xℓ fits in s − 1 bits, and xh + xℓ = x. -Require: K = 2s + 1 +Require: K = 2^s + 1 Require: 2 ≤ s ≤ p − 2 γ ← RN(K · x) δ ← RN(x − γ) -ah ← RN(γ + δ) -aℓ ← RN(x − ah) +xh ← RN(γ + δ) +xℓ ← RN(x − xh) return (xh, xℓ) */ template , typename T = hn::TFromD> @@ -80,6 +83,9 @@ void Split(V x, V &xh, V &xl) { } /* +"Emulation of the FMA in rounded-to-nearest loating-point arithmetic" +Stef Graillat, Jean-Michel Muller +--- Algorithm 4 – DekkerProd(a, b). Dekker’s product. Returns a pair (πh, πℓ) of FP numbers such that πh = RN(ab) and πh + πℓ = ab. Require: s = ⌈p/2⌉ @@ -115,6 +121,9 @@ void DekkerProd(V a, V b, V &pi_h, V &pi_l) { } /* +"Emulation of the FMA in rounded-to-nearest loating-point arithmetic" +Stef Graillat, Jean-Michel Muller +--- Algorithm 7 EmulFMA(a, b, c). Require: P = 2^(p−1) + 1 Require: Q = 2^(p−1) @@ -192,11 +201,12 @@ V fma(V a, V b, V c) { auto g = hn::Mul(t, w); auto mask3 = hn::Lt(g, hn::Zero(d)); // if g < 0 then - auto res = hn::IfThenElse( - mask, d_temp_1, - hn::IfThenElse(mask1, z_h, - hn::IfThenElse(mask2, d_temp_2, - hn::IfThenElse(mask3, z_h, d_temp_2)))); + auto ret3 = hn::IfThenElse(mask3, z_h, d_temp_2); + auto ret2 = hn::IfThenElse(mask2, d_temp_2, ret3); + auto ret1 = hn::IfThenElse(mask1, z_h, ret2); + auto ret = hn::IfThenElse(mask, d_temp_1, ret1); + + auto res = ret; debug_vec("[fma] res", res); debug_msg("[fma] END\n"); @@ -276,7 +286,7 @@ HWY_INLINE hn::Vec FastPow2I(D d, VI x) { const hn::Rebind, D> di; const auto kOffset = Set(di, kOffsetS); const auto offset = Add(x, kOffset); - const auto shift = ShiftLeft(offset); + const auto shift = hn::ShiftLeft(offset); return BitCast(d, shift); } @@ -329,7 +339,7 @@ hn::Vec pow2(D d, V n) { return hn::BitCast(d, res); } -template > +template , typename T = hn::TFromD> V sr_round(V sigma, V tau) { debug_msg("\n[sr_round] START"); debug_vec("[sr_round] sigma", sigma); @@ -362,14 +372,17 @@ V sr_round(V sigma, V tau) { auto exp = hn::Sub(eta, hn::Set(di, mantissa)); auto abs_ulp = pow2(d, exp); - debug_vec("[sr_round] abs_ulp", abs_ulp); + debug_vec("[sr_round] |ulp|", abs_ulp); + auto ulp = hn::CopySign(abs_ulp, tau); debug_vec("[sr_round] ulp", ulp); auto pi = hn::Mul(ulp, z); debug_vec("[sr_round] pi", pi); + auto abs_tau_plus_pi = hn::Abs(hn::Add(tau, pi)); - debug_vec("[sr_round] abs_tau_plus_pi", abs_tau_plus_pi); + debug_vec("[sr_round] |tau|+pi", abs_tau_plus_pi); + auto round = hn::IfThenElse(hn::Ge(abs_tau_plus_pi, abs_ulp), ulp, zero); debug_vec("[sr_round] round", round); diff --git a/src/libvfcinstrumentonline/rand/src/sr_hw.cpp b/src/libvfcinstrumentonline/rand/src/sr_hw.cpp index 5053a087..4e34d2ca 100644 --- a/src/libvfcinstrumentonline/rand/src/sr_hw.cpp +++ b/src/libvfcinstrumentonline/rand/src/sr_hw.cpp @@ -294,17 +294,26 @@ define_fp_xN_unary_op(float, f32, 16, sqrt); #endif template -void _round(const T *HWY_RESTRICT sigma, const T *HWY_RESTRICT tau, - T *HWY_RESTRICT result, const size_t count) { +HWY_NOINLINE void _round(const T *HWY_RESTRICT sigma, const T *HWY_RESTRICT tau, + T *HWY_RESTRICT result, const size_t count) { using D = hn::ScalableTag; const D d; const size_t N = hn::Lanes(d); - for (size_t i = 0; i + N <= count; i += N) { + size_t i = 0; + for (; i + N <= count; i += N) { auto sigma_vec = hn::Load(d, sigma + i); auto tau_vec = hn::Load(d, tau + i); auto res = sr_round(sigma_vec, tau_vec); hn::Store(res, d, result + i); } + // using C = hn::CappedTag; + // const C c; + // for (; i < count; ++i) { + // auto sigma_vec = hn::Load(d, sigma + i); + // auto tau_vec = hn::Load(d, tau + i); + // auto res = sr_round(sigma_vec, tau_vec); + // hn::Store(res, c, result + i); + // } } template @@ -336,12 +345,21 @@ HWY_NOINLINE void _sub(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, using D = hn::ScalableTag; const D d; const size_t N = hn::Lanes(d); - for (size_t i = 0; i + N <= count; i += N) { + size_t i = 0; + for (; i + N <= count; i += N) { auto a_vec = hn::Load(d, a + i); auto b_vec = hn::Load(d, b + i); auto res = sr_sub(a_vec, b_vec); hn::Store(res, d, result + i); } + using C = hn::CappedTag; + const C c; + for (; i < count; ++i) { + auto a_vec = hn::Load(c, a + i); + auto b_vec = hn::Load(c, b + i); + auto res = sr_sub(a_vec, b_vec); + hn::Store(res, c, result + i); + } } template @@ -350,12 +368,21 @@ HWY_NOINLINE void _mul(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, using D = hn::ScalableTag; const hn::ScalableTag d; const size_t N = hn::Lanes(d); - for (size_t i = 0; i + N <= count; i += N) { + size_t i = 0; + for (; i + N <= count; i += N) { auto a_vec = hn::Load(d, a + i); auto b_vec = hn::Load(d, b + i); auto res = sr_mul(a_vec, b_vec); hn::Store(res, d, result + i); } + using C = hn::CappedTag; + const C c; + for (; i < count; ++i) { + auto a_vec = hn::Load(c, a + i); + auto b_vec = hn::Load(c, b + i); + auto res = sr_mul(a_vec, b_vec); + hn::Store(res, c, result + i); + } } template @@ -364,12 +391,21 @@ HWY_NOINLINE void _div(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, using D = hn::ScalableTag; const hn::ScalableTag d; const size_t N = hn::Lanes(d); - for (size_t i = 0; i + N <= count; i += N) { + size_t i = 0; + for (; i + N <= count; i += N) { auto a_vec = hn::Load(d, a + i); auto b_vec = hn::Load(d, b + i); auto res = sr_div(a_vec, b_vec); hn::Store(res, d, result + i); } + using C = hn::CappedTag; + const C c; + for (; i < count; ++i) { + auto a_vec = hn::Load(c, a + i); + auto b_vec = hn::Load(c, b + i); + auto res = sr_div(a_vec, b_vec); + hn::Store(res, c, result + i); + } } template @@ -378,11 +414,19 @@ HWY_NOINLINE void _sqrt(const T *HWY_RESTRICT a, T *HWY_RESTRICT result, using D = hn::ScalableTag; const hn::ScalableTag d; const size_t N = hn::Lanes(d); - for (size_t i = 0; i + N <= count; i += N) { + size_t i = 0; + for (; i + N <= count; i += N) { auto a_vec = hn::Load(d, a + i); auto res = sr_sqrt(a_vec); hn::Store(res, d, result + i); } + // using C = hn::CappedTag; + // const C c; + // for (; i < count; ++i) { + // auto a_vec = hn::Load(d, a + i); + // auto res = sr_sqrt(a_vec); + // hn::Store(res, c, result + i); + // } } void _round_f32(const float *HWY_RESTRICT sigma, const float *HWY_RESTRICT tau, diff --git a/src/libvfcinstrumentonline/rand/src/ud.h b/src/libvfcinstrumentonline/rand/src/ud.h new file mode 100644 index 00000000..5749dba8 --- /dev/null +++ b/src/libvfcinstrumentonline/rand/src/ud.h @@ -0,0 +1,40 @@ +#ifndef __VERIFICARLO_SRLIB_UD_HPP__ +#define __VERIFICARLO_SRLIB_UD_HPP__ + +#include "debug.hpp" +#include "eft.hpp" +#include "utils.hpp" +#include "xoroshiro256+_hw.h" + +namespace ud { +namespace scalar { + +template T udround(T a) { + debug_start(); + if (a == 0) { + debug_end(); + return a; + } + debug_print("a = %.13a\n", a); + using I = typename sr::utils::IEEE754::I; + I a_bits = *reinterpret_cast(&a); + std::uint64_t rand = sr::scalar::xoroshiro256plus::static_dispatch::random(); + debug_print("rand = 0x%02x\n", rand); + // get the last bit of the random number to get -1 or 1 + a_bits += 1 - ((rand & 1) << 1); + a = *reinterpret_cast(&a_bits); + debug_print("round(a) = %.13a\n", a); + debug_end(); + return a; +} + +template T add(T a, T b) { return udround(a + b); } +template T sub(T a, T b) { return udround(a - b); } +template T mul(T a, T b) { return udround(a * b); } +template T div(T a, T b) { return udround(a / b); } +template T sqrt(T a) { return udround(std::sqrt(a)); } + +} // namespace scalar +} // namespace ud + +#endif // __UD_HPP__ \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/src/ud.hpp b/src/libvfcinstrumentonline/rand/src/ud.hpp deleted file mode 100644 index 15cb172d..00000000 --- a/src/libvfcinstrumentonline/rand/src/ud.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef __VERIFICARLO_SRLIB_UD_HPP__ -#define __VERIFICARLO_SRLIB_UD_HPP__ - -#include "debug.hpp" -#include "eft.hpp" -#include "rand.hpp" -#include "utils.hpp" - -template T ud_round(T a) { - debug_start(); - if (a == 0) { - debug_end(); - return a; - } - debug_print("a = %.13a\n", a); - using I = typename sr::utils::IEEE754::I; - I a_bits = *reinterpret_cast(&a); - uint8_t rand = get_rand_uint1(); - debug_print("rand = 0x%02x\n", rand); - a_bits += 1 - (rand * 2); - debug_print("ud_round(%.13a)", a); - a = *reinterpret_cast(&a_bits); - debug_print(" = %.13a\n", a); - debug_end(); - return a; -} - -template T ud_add(T a, T b) { return ud_round(a + b); } -template T ud_sub(T a, T b) { return ud_add(a, -b); } -template T ud_mul(T a, T b) { return ud_round(a * b); } -template T ud_div(T a, T b) { return ud_round(a / b); } - -float ud_add_float(float a, float b) { return ud_add(a, b); } -float ud_sub_float(float a, float b) { return ud_sub(a, b); } -float ud_mul_float(float a, float b) { return ud_mul(a, b); } -float ud_div_float(float a, float b) { return ud_div(a, b); } - -double ud_add_double(double a, double b) { return ud_add(a, b); } -double ud_sub_double(double a, double b) { return ud_sub(a, b); } -double ud_mul_double(double a, double b) { return ud_mul(a, b); } -double ud_div_double(double a, double b) { return ud_div(a, b); } - -#endif // __UD_HPP__ \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/src/ud_hw-inl.h b/src/libvfcinstrumentonline/rand/src/ud_hw-inl.h new file mode 100644 index 00000000..60b0b9ac --- /dev/null +++ b/src/libvfcinstrumentonline/rand/src/ud_hw-inl.h @@ -0,0 +1,139 @@ +#if defined(HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_ +#undef HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_ +#else +#define HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_ +#endif + +// clang-format off +#include "hwy/highway.h" +#include "hwy/print-inl.h" +#include "src/debug_hwy-inl.h" +#include "src/utils.hpp" +#include "src/xoroshiro256+_hw-inl.hpp" +// clang-format on + +HWY_BEFORE_NAMESPACE(); // at file scope +namespace ud { +namespace vector { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; +namespace rng = sr::vector::xoroshiro256plus::HWY_NAMESPACE; +namespace dbg = sr::vector::HWY_NAMESPACE; + +template , typename T = hn::TFromD> +V round(V a) { + debug_start(); + dbg::debug_vec("[round] a", a); + + using DI = hn::RebindToSigned; + using DU = hn::RebindToUnsigned; + const DI di{}; + const DU du{}; + const D d{}; + + auto is_not_zero = hn::Ne(a, hn::Set(d, 0)); + auto is_finite = hn::IsFinite(a); + auto must_be_rounded = hn::And(is_finite, is_not_zero); + + // rand = 1 - 2 * (z & 1) + auto z = rng::random(du); + auto z_di = hn::BitCast(di, z); + auto z_last_bit = hn::And(hn::Set(di, 0x1), z_di); + auto z_last_bit_two = hn::ShiftLeft<1>(z_last_bit); + auto rand = hn::Sub(hn::Set(di, 1), z_last_bit_two); + + auto a_di = hn::BitCast(di, a); + auto a_rounded = hn::Add(a_di, rand); + auto res = hn::BitCast(d, a_rounded); + + auto ret = hn::IfThenElse(must_be_rounded, res, a); + + dbg::debug_vec("[round] res", ret); + debug_end(); + + return ret; +} + +template , typename T = hn::TFromD> +V add(V a, V b) { + debug_start(); + dbg::debug_vec("[add] a", a); + dbg::debug_vec("[add] b", b); + + V c = hn::Add(a, b); + auto res = round(c); + + dbg::debug_vec("[add] res", res); + debug_end(); + + return res; +} + +template , typename T = hn::TFromD> +V sub(V a, V b) { + debug_start(); + dbg::debug_vec("[sub] a", a); + dbg::debug_vec("[sub] b", b); + + V c = hn::Sub(a, b); + auto res = round(c); + + dbg::debug_vec("[sub] res", res); + debug_end(); + + return res; +} + +template , typename T = hn::TFromD> +V mul(V a, V b) { + debug_start(); + dbg::debug_vec("[mul] a", a); + dbg::debug_vec("[mul] b", b); + + V c = hn::Mul(a, b); + auto res = round(c); + + dbg::debug_vec("[mul] res", res); + debug_end(); + + return res; +} + +template , typename T = hn::TFromD> +V div(V a, V b) { + debug_start(); + dbg::debug_vec("[div] a", a); + dbg::debug_vec("[div] b", b); + + V c = hn::Div(a, b); + auto res = round(c); + + dbg::debug_vec("[div] res", res); + debug_end(); + + return res; +} + +template , typename T = hn::TFromD> +V sqrt(V a) { + debug_start(); + dbg::debug_vec("[sqrt] a", a); + + V c = hn::Sqrt(a); + auto res = round(c); + + dbg::debug_vec("[sqrt] res", res); + debug_end(); + + return res; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace vector +} // namespace ud +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_VERIFICARLO_UD_HW_INL_H_ diff --git a/src/libvfcinstrumentonline/rand/src/ud_hw.cpp b/src/libvfcinstrumentonline/rand/src/ud_hw.cpp new file mode 100644 index 00000000..f8ecaec1 --- /dev/null +++ b/src/libvfcinstrumentonline/rand/src/ud_hw.cpp @@ -0,0 +1,1584 @@ + +#include "src/debug.hpp" +#include "src/ud.h" + +// Generates code for every target that this compiler can support. +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "src/ud_hw.cpp" // this file +#include "hwy/foreach_target.h" // must come before highway.h + +// clang-format off +#include "hwy/highway.h" +#include "src/ud_hw-inl.h" +#include "src/ud_hw.h" +#include "src/xoroshiro256+_hw-inl.hpp" +// clang-format on + +// Optional, can instead add HWY_ATTR to all functions. +HWY_BEFORE_NAMESPACE(); +namespace ud { +namespace vector { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +namespace internal { + +template , + class V = hn::Vec> +V _add_fp_xN(V a, V b) { + auto res = add(a, b); + return res; +} + +template , + class V = hn::Vec> +V _sub_fp_xN(V a, V b) { + auto res = sub(a, b); + return res; +} + +template , + class V = hn::Vec> +V _mul_fp_xN(V a, V b) { + auto res = mul(a, b); + return res; +} + +template , + class V = hn::Vec> +V _div_fp_xN(V a, V b) { + auto res = div(a, b); + return res; +} + +template , + class V = hn::Vec> +V _sqrt_fp_xN(V a) { + auto res = sqrt(a); + return res; +} + +/* 32-bits */ +#if HWY_MAX_BYTES >= 4 +using f32x1_t = hn::FixedTag; +using f32x1 = hn::Vec; + +f32x1 _add_f32_x1(f32x1 a, f32x1 b) { return _add_fp_xN(a, b); } +f32x1 _sub_f32_x1(f32x1 a, f32x1 b) { return _sub_fp_xN(a, b); } +f32x1 _mul_f32_x1(f32x1 a, f32x1 b) { return _mul_fp_xN(a, b); } +f32x1 _div_f32_x1(f32x1 a, f32x1 b) { return _div_fp_xN(a, b); } +f32x1 _sqrt_f32_x1(f32x1 a) { return _sqrt_fp_xN(a); } +#endif + +/* 64-bits */ +#if HWY_MAX_BYTES >= 8 +using f64x1_t = hn::FixedTag; +using f64x1 = hn::Vec; +using f32x2_t = hn::FixedTag; +using f32x2 = hn::Vec; + +f64x1 _add_f64_x1(f64x1 a, f64x1 b) { return _add_fp_xN(a, b); } +f64x1 _sub_f64_x1(f64x1 a, f64x1 b) { return _sub_fp_xN(a, b); } +f64x1 _mul_f64_x1(f64x1 a, f64x1 b) { return _mul_fp_xN(a, b); } +f64x1 _div_f64_x1(f64x1 a, f64x1 b) { return _div_fp_xN(a, b); } +f64x1 _sqrt_f64_x1(f64x1 a) { return _sqrt_fp_xN(a); } + +f32x2 _add_f32_x2(f32x2 a, f32x2 b) { return _add_fp_xN(a, b); } +f32x2 _sub_f32_x2(f32x2 a, f32x2 b) { return _sub_fp_xN(a, b); } +f32x2 _mul_f32_x2(f32x2 a, f32x2 b) { return _mul_fp_xN(a, b); } +f32x2 _div_f32_x2(f32x2 a, f32x2 b) { return _div_fp_xN(a, b); } +f32x2 _sqrt_f32_x2(f32x2 a) { return _sqrt_fp_xN(a); } +#endif + +/* 128-bits */ +#if HWY_MAX_BYTES >= 16 +using f64x2_t = hn::FixedTag; +using f64x2 = hn::Vec; +using f32x4_t = hn::FixedTag; +using f32x4 = hn::Vec; + +f64x2 _add_f64_x2(f64x2 a, f64x2 b) { return _add_fp_xN(a, b); } +f64x2 _sub_f64_x2(f64x2 a, f64x2 b) { return _sub_fp_xN(a, b); } +f64x2 _mul_f64_x2(f64x2 a, f64x2 b) { return _mul_fp_xN(a, b); } +f64x2 _div_f64_x2(f64x2 a, f64x2 b) { return _div_fp_xN(a, b); } +f64x2 _sqrt_f64_x2(f64x2 a) { return _sqrt_fp_xN(a); } + +f32x4 _add_f32_x4(f32x4 a, f32x4 b) { return _add_fp_xN(a, b); } +f32x4 _sub_f32_x4(f32x4 a, f32x4 b) { return _sub_fp_xN(a, b); } +f32x4 _mul_f32_x4(f32x4 a, f32x4 b) { return _mul_fp_xN(a, b); } +f32x4 _div_f32_x4(f32x4 a, f32x4 b) { return _div_fp_xN(a, b); } +f32x4 _sqrt_f32_x4(f32x4 a) { return _sqrt_fp_xN(a); } + +#endif + +/* 256-bits */ +#if HWY_MAX_BYTES >= 32 +using f64x4_t = hn::FixedTag; +using f64x4 = hn::Vec; +using f32x8_t = hn::FixedTag; +using f32x8 = hn::Vec; + +f64x4 _add_f64_x4(f64x4 a, f64x4 b) { return _add_fp_xN(a, b); } +f64x4 _sub_f64_x4(f64x4 a, f64x4 b) { return _sub_fp_xN(a, b); } +f64x4 _mul_f64_x4(f64x4 a, f64x4 b) { return _mul_fp_xN(a, b); } +f64x4 _div_f64_x4(f64x4 a, f64x4 b) { return _div_fp_xN(a, b); } +f64x4 _sqrt_f64_x4(f64x4 a) { return _sqrt_fp_xN(a); } + +f32x8 _add_f32_x8(f32x8 a, f32x8 b) { return _add_fp_xN(a, b); } +f32x8 _sub_f32_x8(f32x8 a, f32x8 b) { return _sub_fp_xN(a, b); } +f32x8 _mul_f32_x8(f32x8 a, f32x8 b) { return _mul_fp_xN(a, b); } +f32x8 _div_f32_x8(f32x8 a, f32x8 b) { return _div_fp_xN(a, b); } +f32x8 _sqrt_f32_x8(f32x8 a) { return _sqrt_fp_xN(a); } +#endif + +/* 512-bits */ +#if HWY_MAX_BYTES >= 64 +using f64x8_t = hn::FixedTag; +using f64x8 = hn::Vec; +using f32x16_t = hn::FixedTag; +using f32x16 = hn::Vec; + +f64x8 _add_f64_x8(f64x8 a, f64x8 b) { return _add_fp_xN(a, b); } +f64x8 _sub_f64_x8(f64x8 a, f64x8 b) { return _sub_fp_xN(a, b); } +f64x8 _mul_f64_x8(f64x8 a, f64x8 b) { return _mul_fp_xN(a, b); } +f64x8 _div_f64_x8(f64x8 a, f64x8 b) { return _div_fp_xN(a, b); } +f64x8 _sqrt_f64_x8(f64x8 a) { return _sqrt_fp_xN(a); } + +f32x16 _add_f32_x16(f32x16 a, f32x16 b) { return _add_fp_xN(a, b); } +f32x16 _sub_f32_x16(f32x16 a, f32x16 b) { return _sub_fp_xN(a, b); } +f32x16 _mul_f32_x16(f32x16 a, f32x16 b) { return _mul_fp_xN(a, b); } +f32x16 _div_f32_x16(f32x16 a, f32x16 b) { return _div_fp_xN(a, b); } +f32x16 _sqrt_f32_x16(f32x16 a) { return _sqrt_fp_xN(a); } +#endif +} // namespace internal + +template , + class V = hn::Vec> +void _addxN(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT c) { + const D d; + const auto va = hn::Load(d, a); + const auto vb = hn::Load(d, b); + auto res = internal::_add_fp_xN(va, vb); + hn::Store(res, d, c); +} + +template , + class V = hn::Vec> +void _subxN(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT c) { + const D d; + const auto va = hn::Load(d, a); + const auto vb = hn::Load(d, b); + auto res = internal::_sub_fp_xN(va, vb); + hn::Store(res, d, c); +} + +template , + class V = hn::Vec> +void _mulxN(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT c) { + const D d; + const auto va = hn::Load(d, a); + const auto vb = hn::Load(d, b); + auto res = internal::_mul_fp_xN(va, vb); + hn::Store(res, d, c); +} + +template , + class V = hn::Vec> +void _divxN(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT c) { + const D d; + const auto va = hn::Load(d, a); + const auto vb = hn::Load(d, b); + auto res = internal::_div_fp_xN(va, vb); + hn::Store(res, d, c); +} + +template , + class V = hn::Vec> +void _sqrtxN(const T *HWY_RESTRICT a, T *HWY_RESTRICT c) { + const D d; + const auto va = hn::Load(d, a); + auto res = internal::_sqrt_fp_xN(va); + hn::Store(res, d, c); +} + +/* _x_ */ + +#define define_fp_xN_unary_op(type, name, size, op) \ + void _##op##x##size##_##name(const type *HWY_RESTRICT a, \ + type *HWY_RESTRICT c) { \ + _##op##xN(a, c); \ + } + +#define define_fp_xN_bin_op(type, name, size, op) \ + void _##op##x##size##_##name(const type *HWY_RESTRICT a, \ + const type *HWY_RESTRICT b, \ + type *HWY_RESTRICT c) { \ + _##op##xN(a, b, c); \ + } + +/* 32-bits */ +#if HWY_MAX_BYTES >= 4 +define_fp_xN_bin_op(float, f32, 1, add); +define_fp_xN_bin_op(float, f32, 1, sub); +define_fp_xN_bin_op(float, f32, 1, mul); +define_fp_xN_bin_op(float, f32, 1, div); +define_fp_xN_unary_op(float, f32, 1, sqrt); +#endif + +/* 64-bits */ +#if HWY_MAX_BYTES >= 8 +define_fp_xN_bin_op(double, f64, 1, add); +define_fp_xN_bin_op(double, f64, 1, sub); +define_fp_xN_bin_op(double, f64, 1, mul); +define_fp_xN_bin_op(double, f64, 1, div); +define_fp_xN_unary_op(double, f64, 1, sqrt); + +define_fp_xN_bin_op(float, f32, 2, add); +define_fp_xN_bin_op(float, f32, 2, sub); +define_fp_xN_bin_op(float, f32, 2, mul); +define_fp_xN_bin_op(float, f32, 2, div); +define_fp_xN_unary_op(float, f32, 2, sqrt); +#endif + +/* 128-bits */ +#if HWY_MAX_BYTES >= 16 +define_fp_xN_bin_op(double, f64, 2, add); +define_fp_xN_bin_op(double, f64, 2, sub); +define_fp_xN_bin_op(double, f64, 2, mul); +define_fp_xN_bin_op(double, f64, 2, div); +define_fp_xN_unary_op(double, f64, 2, sqrt); + +define_fp_xN_bin_op(float, f32, 4, add); +define_fp_xN_bin_op(float, f32, 4, sub); +define_fp_xN_bin_op(float, f32, 4, mul); +define_fp_xN_bin_op(float, f32, 4, div); +define_fp_xN_unary_op(float, f32, 4, sqrt); +#endif + +/* 256-bits */ +#if HWY_MAX_BYTES >= 32 +define_fp_xN_bin_op(double, f64, 4, add); +define_fp_xN_bin_op(double, f64, 4, sub); +define_fp_xN_bin_op(double, f64, 4, mul); +define_fp_xN_bin_op(double, f64, 4, div); +define_fp_xN_unary_op(double, f64, 4, sqrt); + +define_fp_xN_bin_op(float, f32, 8, add); +define_fp_xN_bin_op(float, f32, 8, sub); +define_fp_xN_bin_op(float, f32, 8, mul); +define_fp_xN_bin_op(float, f32, 8, div); +define_fp_xN_unary_op(float, f32, 8, sqrt); +#endif + +/* 512-bits */ +#if HWY_MAX_BYTES >= 64 +define_fp_xN_bin_op(double, f64, 8, add); +define_fp_xN_bin_op(double, f64, 8, sub); +define_fp_xN_bin_op(double, f64, 8, mul); +define_fp_xN_bin_op(double, f64, 8, div); +define_fp_xN_unary_op(double, f64, 8, sqrt); + +define_fp_xN_bin_op(float, f32, 16, add); +define_fp_xN_bin_op(float, f32, 16, sub); +define_fp_xN_bin_op(float, f32, 16, mul); +define_fp_xN_bin_op(float, f32, 16, div); +define_fp_xN_unary_op(float, f32, 16, sqrt); +#endif +template +void _round(const T *HWY_RESTRICT a, T *HWY_RESTRICT result, + const size_t count) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + size_t i = 0; + for (; i + N <= count; i += N) { + auto a_vec = hn::Load(d, a + i); + auto res = round(a_vec); + hn::Store(res, d, result + i); + } + // using C = hn::CappedTag; + // const C c; + // for (; i < count; ++i) { + // auto sigma_vec = hn::Load(d, sigma + i); + // auto tau_vec = hn::Load(d, tau + i); + // auto res = ud_round(sigma_vec, tau_vec); + // hn::Store(res, c, result + i); + // } +} + +template +HWY_NOINLINE void _add(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT result, const size_t count) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + size_t i = 0; + for (; i + N <= count; i += N) { + auto a_vec = hn::Load(d, a + i); + auto b_vec = hn::Load(d, b + i); + auto res = add(a_vec, b_vec); + hn::Store(res, d, result + i); + } + // using C = hn::CappedTag; + // const C c; + // for (; i < count; ++i) { + // auto a_vec = hn::Load(c, a + i); + // auto b_vec = hn::Load(c, b + i); + // auto res = ud_add(a_vec, b_vec); + // hn::Store(res, c, result + i); + // } +} + +template +HWY_NOINLINE void _sub(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT result, const size_t count) { + using D = hn::ScalableTag; + const D d; + const size_t N = hn::Lanes(d); + size_t i = 0; + for (; i + N <= count; i += N) { + auto a_vec = hn::Load(d, a + i); + auto b_vec = hn::Load(d, b + i); + auto res = sub(a_vec, b_vec); + hn::Store(res, d, result + i); + } + // using C = hn::CappedTag; + // const C c; + // for (; i < count; ++i) { + // auto a_vec = hn::Load(c, a + i); + // auto b_vec = hn::Load(c, b + i); + // auto res = ud_sub(a_vec, b_vec); + // hn::Store(res, c, result + i); + // } +} + +template +HWY_NOINLINE void _mul(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT result, const size_t count) { + using D = hn::ScalableTag; + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + size_t i = 0; + for (; i + N <= count; i += N) { + auto a_vec = hn::Load(d, a + i); + auto b_vec = hn::Load(d, b + i); + auto res = mul(a_vec, b_vec); + hn::Store(res, d, result + i); + } + // using C = hn::CappedTag; + // const C c; + // for (; i < count; ++i) { + // auto a_vec = hn::Load(c, a + i); + // auto b_vec = hn::Load(c, b + i); + // auto res = ud_mul(a_vec, b_vec); + // hn::Store(res, c, result + i); + // } +} + +template +HWY_NOINLINE void _div(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT result, const size_t count) { + using D = hn::ScalableTag; + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + size_t i = 0; + for (; i + N <= count; i += N) { + auto a_vec = hn::Load(d, a + i); + auto b_vec = hn::Load(d, b + i); + auto res = div(a_vec, b_vec); + hn::Store(res, d, result + i); + } + // using C = hn::CappedTag; + // const C c; + // for (; i < count; ++i) { + // auto a_vec = hn::Load(c, a + i); + // auto b_vec = hn::Load(c, b + i); + // auto res = ud_div(a_vec, b_vec); + // hn::Store(res, c, result + i); + // } +} + +template +HWY_NOINLINE void _sqrt(const T *HWY_RESTRICT a, T *HWY_RESTRICT result, + const size_t count) { + using D = hn::ScalableTag; + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + size_t i = 0; + for (; i + N <= count; i += N) { + auto a_vec = hn::Load(d, a + i); + auto res = sqrt(a_vec); + hn::Store(res, d, result + i); + } + // using C = hn::CappedTag; + // const C c; + // for (; i < count; ++i) { + // auto a_vec = hn::Load(d, a + i); + // auto res = ud_sqrt(a_vec); + // hn::Store(res, c, result + i); + // } +} +void _round_f32(const float *HWY_RESTRICT a, float *HWY_RESTRICT result, + const size_t count) { + _round(a, result, count); +} + +void _add_f32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count) { + _add(a, b, result, count); +} +void _sub_f32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count) { + _sub(a, b, result, count); +} + +void _mul_f32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count) { + _mul(a, b, result, count); +} + +void _div_f32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count) { + _div(a, b, result, count); +} + +void _sqrt_f32(const float *HWY_RESTRICT a, float *HWY_RESTRICT result, + const size_t count) { + _sqrt(a, result, count); +} + +void _round_f64(const double *HWY_RESTRICT a, double *HWY_RESTRICT result, + const size_t count) { + _round(a, result, count); +} + +void _add_f64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count) { + _add(a, b, result, count); +} + +void _sub_f64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count) { + _sub(a, b, result, count); +} + +void _mul_f64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count) { + _mul(a, b, result, count); +} + +void _div_f64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count) { + _div(a, b, result, count); +} + +void _sqrt_f64(const double *HWY_RESTRICT a, double *HWY_RESTRICT result, + const size_t count) { + _sqrt(a, result, count); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace vector +} // namespace ud +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace ud { +namespace vector { + +HWY_EXPORT(_round_f32); +HWY_EXPORT(_round_f64); +HWY_EXPORT(_add_f32); +HWY_EXPORT(_add_f64); +HWY_EXPORT(_sub_f32); +HWY_EXPORT(_sub_f64); +HWY_EXPORT(_mul_f32); +HWY_EXPORT(_mul_f64); +HWY_EXPORT(_div_f32); +HWY_EXPORT(_div_f64); +HWY_EXPORT(_sqrt_f32); +HWY_EXPORT(_sqrt_f64); + +/* 32-bits */ +#if HWY_MAX_BYTES >= 4 +HWY_EXPORT(_addx1_f32); +HWY_EXPORT(_subx1_f32); +HWY_EXPORT(_mulx1_f32); +HWY_EXPORT(_divx1_f32); +HWY_EXPORT(_sqrtx1_f32); +#endif + +/* 64-bits */ +#if HWY_MAX_BYTES >= 8 +HWY_EXPORT(_addx1_f64); +HWY_EXPORT(_subx1_f64); +HWY_EXPORT(_mulx1_f64); +HWY_EXPORT(_divx1_f64); +HWY_EXPORT(_sqrtx1_f64); +HWY_EXPORT(_addx2_f32); +HWY_EXPORT(_subx2_f32); +HWY_EXPORT(_mulx2_f32); +HWY_EXPORT(_divx2_f32); +HWY_EXPORT(_sqrtx2_f32); +#endif + +/* 128-bits */ +#if HWY_MAX_BYTES >= 16 +HWY_EXPORT(_addx2_f64); +HWY_EXPORT(_subx2_f64); +HWY_EXPORT(_mulx2_f64); +HWY_EXPORT(_divx2_f64); +HWY_EXPORT(_sqrtx2_f64); +HWY_EXPORT(_addx4_f32); +HWY_EXPORT(_subx4_f32); +HWY_EXPORT(_mulx4_f32); +HWY_EXPORT(_divx4_f32); +HWY_EXPORT(_sqrtx4_f32); +#endif + +/* 256-bits */ +#if HWY_MAX_BYTES >= 32 +HWY_EXPORT(_addx4_f64); +HWY_EXPORT(_subx4_f64); +HWY_EXPORT(_mulx4_f64); +HWY_EXPORT(_divx4_f64); +HWY_EXPORT(_sqrtx4_f64); +HWY_EXPORT(_addx8_f32); +HWY_EXPORT(_subx8_f32); +HWY_EXPORT(_mulx8_f32); +HWY_EXPORT(_divx8_f32); +HWY_EXPORT(_sqrtx8_f32); +#endif + +/* 512-bits */ +#if HWY_MAX_BYTES >= 64 +HWY_EXPORT(_addx8_f64); +HWY_EXPORT(_subx8_f64); +HWY_EXPORT(_mulx8_f64); +HWY_EXPORT(_divx8_f64); +HWY_EXPORT(_sqrtx8_f64); +HWY_EXPORT(_addx16_f32); +HWY_EXPORT(_subx16_f32); +HWY_EXPORT(_mulx16_f32); +HWY_EXPORT(_divx16_f32); +HWY_EXPORT(_sqrtx16_f32); +#endif + +/* 1024-bits */ +#if HWY_MAX_BYTES >= 128 +HWY_EXPORT(_addx16_f64); +HWY_EXPORT(_subx16_f64); +HWY_EXPORT(_mulx16_f64); +HWY_EXPORT(_divx16_f64); +HWY_EXPORT(_sqrtx16_f64); +#endif + +template +void round(const T *HWY_RESTRICT a, T *HWY_RESTRICT result, + const size_t count) { + if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_round_f32)(a, result, count); + } else if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_round_f64)(a, result, count); + } +} + +template +void add(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT result, const size_t count) { + if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_add_f32)(a, b, result, count); + } else if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_add_f64)(a, b, result, count); + } +} + +template +void sub(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT result, const size_t count) { + if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_sub_f32)(a, b, result, count); + } else if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_sub_f64)(a, b, result, count); + } +} + +template +void mul(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT result, const size_t count) { + if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_mul_f32)(a, b, result, count); + } else if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_mul_f64)(a, b, result, count); + } +} + +template +void div(const T *HWY_RESTRICT a, const T *HWY_RESTRICT b, + T *HWY_RESTRICT result, const size_t count) { + if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_div_f32)(a, b, result, count); + } else if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_div_f64)(a, b, result, count); + } +} + +template +void sqrt(const T *HWY_RESTRICT a, T *HWY_RESTRICT result, const size_t count) { + if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_sqrt_f32)(a, result, count); + } else if constexpr (std::is_same_v) { + return HWY_DYNAMIC_DISPATCH(_sqrt_f64)(a, result, count); + } +} + +/* Array functions */ + +void addf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count) { + return add(a, b, result, count); +} + +void subf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count) { + return sub(a, b, result, count); +} + +void mulf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count) { + return mul(a, b, result, count); +} + +void divf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count) { + return div(a, b, result, count); +} + +void sqrtf32(const float *HWY_RESTRICT a, float *HWY_RESTRICT result, + const size_t count) { + return sqrt(a, result, count); +} + +void addf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count) { + return add(a, b, result, count); +} + +void subf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count) { + return sub(a, b, result, count); +} + +void mulf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count) { + return mul(a, b, result, count); +} + +void divf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count) { + return div(a, b, result, count); +} + +void sqrtf64(const double *HWY_RESTRICT a, double *HWY_RESTRICT result, + const size_t count) { + return sqrt(a, result, count); +} + +/* Single vector instructions with dynamic dispatch */ + +/* 64-bits functions */ +#if HWY_MAX_BYTES >= 8 +void addf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_addx2_f32)(a, b, result); +} + +void subf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_subx2_f32)(a, b, result); +} + +void mulf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_mulx2_f32)(a, b, result); +} + +void divf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_divx2_f32)(a, b, result); +} +#endif + +/* 128-bits functions */ +#if HWY_MAX_BYTES >= 16 +void addf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_addx2_f64)(a, b, result); +} + +void subf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_subx2_f64)(a, b, result); +} + +void mulf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_mulx2_f64)(a, b, result); +} + +void divf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_divx2_f64)(a, b, result); +} + +void sqrtf64x2(const double *HWY_RESTRICT a, double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_sqrtx2_f64)(a, result); +} + +void addf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_addx4_f32)(a, b, result); +} + +void subf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_subx4_f32)(a, b, result); +} + +void mulf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_mulx4_f32)(a, b, result); +} + +void divf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_divx4_f32)(a, b, result); +} +#endif + +/* 256-bits functions*/ + +#if HWY_MAX_BYTES >= 32 +void addf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_addx8_f32)(a, b, result); +} + +void subf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_subx8_f32)(a, b, result); +} + +void mulf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_mulx8_f32)(a, b, result); +} + +void divf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_divx8_f32)(a, b, result); +} + +void addf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_addx4_f64)(a, b, result); +} + +void subf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_subx4_f64)(a, b, result); +} + +void mulf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_mulx4_f64)(a, b, result); +} + +void divf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_divx4_f64)(a, b, result); +} +#endif + +/* 512-bits functions */ + +#if HWY_MAX_BYTES >= 64 +void addf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_addx8_f64)(a, b, result); +} + +void subf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_subx8_f64)(a, b, result); +} + +void mulf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_mulx8_f64)(a, b, result); +} + +void divf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_divx8_f64)(a, b, result); +} + +void addf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_addx16_f32)(a, b, result); +} + +void subf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_subx16_f32)(a, b, result); +} + +void mulf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_mulx16_f32)(a, b, result); +} + +void divf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_divx16_f32)(a, b, result); +} +#endif + +/* 1024-bits functions */ +#if HWY_MAX_BYTES >= 128 +void addf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_addx16_f64)(a, b, result); +} + +void subf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_subx16_f64)(a, b, result); +} + +void mulf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_mulx16_f64)(a, b, result); +} + +void divf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result) { + HWY_DYNAMIC_DISPATCH(_divx16_f64)(a, b, result); +} +#endif + +/* Single vector functions with static dispatch */ + +/* 64-bits */ + +#if HWY_MAX_BYTES >= 8 + +f32x2_v addf32x2_v(const f32x2_v a, const f32x2_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v)); + HWY_STATIC_DISPATCH(_addx2_f32)(a_ptr, b_ptr, result_ptr); + f32x2_v result; + std::memcpy(&result, result_ptr, sizeof(f32x2_v)); + return result; +} + +f32x2_v subf32x2_v(const f32x2_v a, const f32x2_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v)); + HWY_STATIC_DISPATCH(_subx2_f32)(a_ptr, b_ptr, result_ptr); + f32x2_v result; + std::memcpy(&result, result_ptr, sizeof(f32x2_v)); + return result; +} + +f32x2_v mulf32x2_v(const f32x2_v a, const f32x2_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v)); + HWY_STATIC_DISPATCH(_mulx2_f32)(a_ptr, b_ptr, result_ptr); + f32x2_v result; + std::memcpy(&result, result_ptr, sizeof(f32x2_v)); + return result; +} + +f32x2_v divf32x2_v(const f32x2_v a, const f32x2_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v)); + HWY_STATIC_DISPATCH(_divx2_f32)(a_ptr, b_ptr, result_ptr); + f32x2_v result; + std::memcpy(&result, result_ptr, sizeof(f32x2_v)); + return result; +} + +#endif + +/* 128-bits */ + +#if HWY_MAX_BYTES >= 16 + +f64x2_v addf64x2_v(const f64x2_v a, const f64x2_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v)); + HWY_STATIC_DISPATCH(_addx2_f64)(a_ptr, b_ptr, result_ptr); + f64x2_v result; + std::memcpy(&result, result_ptr, sizeof(f64x2_v)); + return result; +} + +f64x2_v subf64x2_v(const f64x2_v a, const f64x2_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v)); + HWY_STATIC_DISPATCH(_subx2_f64)(a_ptr, b_ptr, result_ptr); + f64x2_v result; + std::memcpy(&result, result_ptr, sizeof(f64x2_v)); + return result; +} + +f64x2_v mulf64x2_v(const f64x2_v a, const f64x2_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v)); + HWY_STATIC_DISPATCH(_mulx2_f64)(a_ptr, b_ptr, result_ptr); + f64x2_v result; + std::memcpy(&result, result_ptr, sizeof(f64x2_v)); + return result; +} + +f64x2_v divf64x2_v(const f64x2_v a, const f64x2_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v)); + HWY_STATIC_DISPATCH(_divx2_f64)(a_ptr, b_ptr, result_ptr); + f64x2_v result; + std::memcpy(&result, result_ptr, sizeof(f64x2_v)); + return result; +} + +f32x4_v addf32x4_v(const f32x4_v a, const f32x4_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v)); + HWY_STATIC_DISPATCH(_addx4_f32)(a_ptr, b_ptr, result_ptr); + f32x4_v result; + std::memcpy(&result, result_ptr, sizeof(f32x4_v)); + return result; +} + +f32x4_v subf32x4_v(const f32x4_v a, const f32x4_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v)); + HWY_STATIC_DISPATCH(_subx4_f32)(a_ptr, b_ptr, result_ptr); + f32x4_v result; + std::memcpy(&result, result_ptr, sizeof(f32x4_v)); + return result; +} + +f32x4_v mulf32x4_v(const f32x4_v a, const f32x4_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v)); + HWY_STATIC_DISPATCH(_mulx4_f32)(a_ptr, b_ptr, result_ptr); + f32x4_v result; + std::memcpy(&result, result_ptr, sizeof(f32x4_v)); + return result; +} + +f32x4_v divf32x4_v(const f32x4_v a, const f32x4_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v)); + HWY_STATIC_DISPATCH(_divx4_f32)(a_ptr, b_ptr, result_ptr); + f32x4_v result; + std::memcpy(&result, result_ptr, sizeof(f32x4_v)); + return result; +} + +#endif + +/* 256-bits */ + +#if HWY_MAX_BYTES >= 32 + +f64x4_v addf64x4_v(const f64x4_v a, const f64x4_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v)); + HWY_STATIC_DISPATCH(_addx4_f64)(a_ptr, b_ptr, result_ptr); + f64x4_v result; + std::memcpy(&result, result_ptr, sizeof(f64x4_v)); + return result; +} + +f64x4_v subf64x4_v(const f64x4_v a, const f64x4_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v)); + HWY_STATIC_DISPATCH(_subx4_f64)(a_ptr, b_ptr, result_ptr); + f64x4_v result; + std::memcpy(&result, result_ptr, sizeof(f64x4_v)); + return result; +} + +f64x4_v mulf64x4_v(const f64x4_v a, const f64x4_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v)); + HWY_STATIC_DISPATCH(_mulx4_f64)(a_ptr, b_ptr, result_ptr); + f64x4_v result; + std::memcpy(&result, result_ptr, sizeof(f64x4_v)); + return result; +} + +f64x4_v divf64x4_v(const f64x4_v a, const f64x4_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v)); + HWY_STATIC_DISPATCH(_divx4_f64)(a_ptr, b_ptr, result_ptr); + f64x4_v result; + std::memcpy(&result, result_ptr, sizeof(f64x4_v)); + return result; +} + +f32x8_v addf32x8_v(const f32x8_v a, const f32x8_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v)); + HWY_STATIC_DISPATCH(_addx8_f32)(a_ptr, b_ptr, result_ptr); + f32x8_v result; + std::memcpy(&result, result_ptr, sizeof(f32x8_v)); + return result; +} + +f32x8_v subf32x8_v(const f32x8_v a, const f32x8_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v)); + HWY_STATIC_DISPATCH(_subx8_f32)(a_ptr, b_ptr, result_ptr); + f32x8_v result; + std::memcpy(&result, result_ptr, sizeof(f32x8_v)); + return result; +} + +f32x8_v mulf32x8_v(const f32x8_v a, const f32x8_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v)); + HWY_STATIC_DISPATCH(_mulx8_f32)(a_ptr, b_ptr, result_ptr); + f32x8_v result; + std::memcpy(&result, result_ptr, sizeof(f32x8_v)); + return result; +} + +f32x8_v divf32x8_v(const f32x8_v a, const f32x8_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v)); + HWY_STATIC_DISPATCH(_divx8_f32)(a_ptr, b_ptr, result_ptr); + f32x8_v result; + std::memcpy(&result, result_ptr, sizeof(f32x8_v)); + return result; +} + +#endif + +/* 512-bits */ +#if HWY_MAX_BYTES >= 64 +f64x8_v addf64x8_v(const f64x8_v a, const f64x8_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v)); + HWY_STATIC_DISPATCH(_addx8_f64)(a_ptr, b_ptr, result_ptr); + f64x8_v result; + std::memcpy(&result, result_ptr, sizeof(f64x8_v)); + return result; +} + +f64x8_v subf64x8_v(const f64x8_v a, const f64x8_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v)); + HWY_STATIC_DISPATCH(_subx8_f64)(a_ptr, b_ptr, result_ptr); + f64x8_v result; + std::memcpy(&result, result_ptr, sizeof(f64x8_v)); + return result; +} + +f64x8_v mulf64x8_v(const f64x8_v a, const f64x8_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v)); + HWY_STATIC_DISPATCH(_mulx8_f64)(a_ptr, b_ptr, result_ptr); + f64x8_v result; + std::memcpy(&result, result_ptr, sizeof(f64x8_v)); + return result; +} + +f64x8_v divf64x8_v(const f64x8_v a, const f64x8_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v)); + HWY_STATIC_DISPATCH(_divx8_f64)(a_ptr, b_ptr, result_ptr); + f64x8_v result; + std::memcpy(&result, result_ptr, sizeof(f64x8_v)); + return result; +} + +f32x16_v addf32x16_v(const f32x16_v a, const f32x16_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v)); + HWY_STATIC_DISPATCH(_addx16_f32)(a_ptr, b_ptr, result_ptr); + f32x16_v result; + std::memcpy(&result, result_ptr, sizeof(f32x16_v)); + return result; +} + +f32x16_v subf32x16_v(const f32x16_v a, const f32x16_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v)); + HWY_STATIC_DISPATCH(_subx16_f32)(a_ptr, b_ptr, result_ptr); + f32x16_v result; + std::memcpy(&result, result_ptr, sizeof(f32x16_v)); + return result; +} + +f32x16_v mulf32x16_v(const f32x16_v a, const f32x16_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v)); + HWY_STATIC_DISPATCH(_mulx16_f32)(a_ptr, b_ptr, result_ptr); + f32x16_v result; + std::memcpy(&result, result_ptr, sizeof(f32x16_v)); + return result; +} + +f32x16_v divf32x16_v(const f32x16_v a, const f32x16_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v)); + HWY_STATIC_DISPATCH(_divx16_f32)(a_ptr, b_ptr, result_ptr); + f32x16_v result; + std::memcpy(&result, result_ptr, sizeof(f32x16_v)); + return result; +} + +#endif // 512-bits + +/* 1024-bits */ +#if HWY_MAX_BYTES >= 128 +f64x16_v addf64x16_v(const f64x16_v a, const f64x16_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v)); + HWY_STATIC_DISPATCH(_addx16_f64)(a_ptr, b_ptr, result_ptr); + f64x16_v result; + std::memcpy(&result, result_ptr, sizeof(f64x16_v)); + return result; +} + +f64x16_v subf64x16_v(const f64x16_v a, const f64x16_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v)); + HWY_STATIC_DISPATCH(_subx16_f64)(a_ptr, b_ptr, result_ptr); + f64x16_v result; + std::memcpy(&result, result_ptr, sizeof(f64x16_v)); + return result; +} + +f64x16_v mulf64x16_v(const f64x16_v a, const f64x16_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v)); + HWY_STATIC_DISPATCH(_mulx16_f64)(a_ptr, b_ptr, result_ptr); + f64x16_v result; + std::memcpy(&result, result_ptr, sizeof(f64x16_v)); + return result; +} + +f64x16_v divf64x16_v(const f64x16_v a, const f64x16_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v)); + HWY_STATIC_DISPATCH(_divx16_f64)(a_ptr, b_ptr, result_ptr); + f64x16_v result; + std::memcpy(&result, result_ptr, sizeof(f64x16_v)); + return result; +} +#endif // 1024-bits + +/* Single vector functions, dynamic dispatch */ + +/* IEEE-754 binary32 x2 */ + +f32x2_v addf32x2_d(const f32x2_v a, const f32x2_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v)); + addf32(a_ptr, b_ptr, result_ptr, 2); + f32x2_v result; + std::memcpy(&result, result_ptr, sizeof(f32x2_v)); + return result; +} + +f32x2_v subf32x2_d(const f32x2_v a, const f32x2_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v)); + subf32(a_ptr, b_ptr, result_ptr, 2); + f32x2_v result; + std::memcpy(&result, result_ptr, sizeof(f32x2_v)); + return result; +} + +f32x2_v mulf32x2_d(const f32x2_v a, const f32x2_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v)); + mulf32(a_ptr, b_ptr, result_ptr, 2); + f32x2_v result; + std::memcpy(&result, result_ptr, sizeof(f32x2_v)); + return result; +} + +f32x2_v divf32x2_d(const f32x2_v a, const f32x2_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(8, sizeof(f32x2_v)); + divf32(a_ptr, b_ptr, result_ptr, 2); + f32x2_v result; + std::memcpy(&result, result_ptr, sizeof(f32x2_v)); + return result; +} + +/* IEEE-754 binary64 x2 */ + +f64x2_v addf64x2_d(const f64x2_v a, const f64x2_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v)); + addf64(a_ptr, b_ptr, result_ptr, 2); + f64x2_v result; + std::memcpy(&result, result_ptr, sizeof(f64x2_v)); + return result; +} + +f64x2_v subf64x2_d(const f64x2_v a, const f64x2_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v)); + subf64(a_ptr, b_ptr, result_ptr, 2); + f64x2_v result; + std::memcpy(&result, result_ptr, sizeof(f64x2_v)); + return result; +} + +f64x2_v mulf64x2_d(const f64x2_v a, const f64x2_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v)); + mulf64(a_ptr, b_ptr, result_ptr, 2); + f64x2_v result; + std::memcpy(&result, result_ptr, sizeof(f64x2_v)); + return result; +} + +f64x2_v divf64x2_d(const f64x2_v a, const f64x2_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(16, sizeof(f64x2_v)); + divf64(a_ptr, b_ptr, result_ptr, 2); + f64x2_v result; + std::memcpy(&result, result_ptr, sizeof(f64x2_v)); + return result; +} + +/* IEEE-754 binary32 x4 */ +f32x4_v addf32x4_d(const f32x4_v a, const f32x4_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v)); + addf32(a_ptr, b_ptr, result_ptr, 4); + f32x4_v result; + std::memcpy(&result, result_ptr, sizeof(f32x4_v)); + return result; +} + +f32x4_v subf32x4_d(const f32x4_v a, const f32x4_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v)); + subf32(a_ptr, b_ptr, result_ptr, 4); + f32x4_v result; + std::memcpy(&result, result_ptr, sizeof(f32x4_v)); + return result; +} + +f32x4_v mulf32x4_d(const f32x4_v a, const f32x4_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v)); + mulf32(a_ptr, b_ptr, result_ptr, 4); + f32x4_v result; + std::memcpy(&result, result_ptr, sizeof(f32x4_v)); + return result; +} + +f32x4_v divf32x4_d(const f32x4_v a, const f32x4_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(16, sizeof(f32x4_v)); + divf32(a_ptr, b_ptr, result_ptr, 4); + f32x4_v result; + std::memcpy(&result, result_ptr, sizeof(f32x4_v)); + return result; +} + +/* IEEE-754 binary64 x4 */ + +f64x4_v addf64x4_d(const f64x4_v a, const f64x4_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v)); + addf64(a_ptr, b_ptr, result_ptr, 4); + f64x4_v result; + std::memcpy(&result, result_ptr, sizeof(f64x4_v)); + return result; +} + +f64x4_v subf64x4_d(const f64x4_v a, const f64x4_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v)); + subf64(a_ptr, b_ptr, result_ptr, 4); + f64x4_v result; + std::memcpy(&result, result_ptr, sizeof(f64x4_v)); + return result; +} + +f64x4_v mulf64x4_d(const f64x4_v a, const f64x4_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v)); + mulf64(a_ptr, b_ptr, result_ptr, 4); + f64x4_v result; + std::memcpy(&result, result_ptr, sizeof(f64x4_v)); + return result; +} + +f64x4_v divf64x4_d(const f64x4_v a, const f64x4_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(32, sizeof(f64x4_v)); + divf64(a_ptr, b_ptr, result_ptr, 4); + f64x4_v result; + std::memcpy(&result, result_ptr, sizeof(f64x4_v)); + return result; +} + +/* IEEE-754 binary32 x8 */ + +f32x8_v addf32x8_d(const f32x8_v a, const f32x8_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v)); + addf32(a_ptr, b_ptr, result_ptr, 8); + f32x8_v result; + std::memcpy(&result, result_ptr, sizeof(f32x8_v)); + return result; +} +f32x8_v subf32x8_d(const f32x8_v a, const f32x8_v b) { + const float *a_ptr = reinterpret_cast(&a); + + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v)); + subf32(a_ptr, b_ptr, result_ptr, 8); + f32x8_v result; + std::memcpy(&result, result_ptr, sizeof(f32x8_v)); + return result; +} + +f32x8_v mulf32x8_d(const f32x8_v a, const f32x8_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v)); + mulf32(a_ptr, b_ptr, result_ptr, 8); + f32x8_v result; + std::memcpy(&result, result_ptr, sizeof(f32x8_v)); + return result; +} + +f32x8_v divf32x8_d(const f32x8_v a, const f32x8_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(32, sizeof(f32x8_v)); + divf32(a_ptr, b_ptr, result_ptr, 8); + f32x8_v result; + std::memcpy(&result, result_ptr, sizeof(f32x8_v)); + return result; +} + +/* IEEE-754 binary64 x8 */ + +f64x8_v addf64x8_d(const f64x8_v a, const f64x8_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v)); + addf64(a_ptr, b_ptr, result_ptr, 8); + f64x8_v result; + std::memcpy(&result, result_ptr, sizeof(f64x8_v)); + return result; +} + +f64x8_v subf64x8_d(const f64x8_v a, const f64x8_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v)); + subf64(a_ptr, b_ptr, result_ptr, 8); + f64x8_v result; + std::memcpy(&result, result_ptr, sizeof(f64x8_v)); + return result; +} + +f64x8_v mulf64x8_d(const f64x8_v a, const f64x8_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v)); + mulf64(a_ptr, b_ptr, result_ptr, 8); + f64x8_v result; + std::memcpy(&result, result_ptr, sizeof(f64x8_v)); + return result; +} + +f64x8_v divf64x8_d(const f64x8_v a, const f64x8_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(64, sizeof(f64x8_v)); + divf64(a_ptr, b_ptr, result_ptr, 8); + f64x8_v result; + std::memcpy(&result, result_ptr, sizeof(f64x8_v)); + return result; +} + +/* IEEE-754 binary32 x16 */ + +f32x16_v addf32x16_d(const f32x16_v a, const f32x16_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v)); + addf32(a_ptr, b_ptr, result_ptr, 16); + f32x16_v result; + std::memcpy(&result, result_ptr, sizeof(f32x16_v)); + return result; +} + +f32x16_v subf32x16_d(const f32x16_v a, const f32x16_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v)); + subf32(a_ptr, b_ptr, result_ptr, 16); + f32x16_v result; + std::memcpy(&result, result_ptr, sizeof(f32x16_v)); + return result; +} + +f32x16_v mulf32x16_d(const f32x16_v a, const f32x16_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v)); + mulf32(a_ptr, b_ptr, result_ptr, 16); + f32x16_v result; + std::memcpy(&result, result_ptr, sizeof(f32x16_v)); + return result; +} + +f32x16_v divf32x16_d(const f32x16_v a, const f32x16_v b) { + const float *a_ptr = reinterpret_cast(&a); + const float *b_ptr = reinterpret_cast(&b); + float *result_ptr = (float *)aligned_alloc(64, sizeof(f32x16_v)); + divf32(a_ptr, b_ptr, result_ptr, 16); + f32x16_v result; + std::memcpy(&result, result_ptr, sizeof(f32x16_v)); + return result; +} + +/* IEEE-754 binary64 x16 */ + +f64x16_v addf64x16_d(const f64x16_v a, const f64x16_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v)); + addf64(a_ptr, b_ptr, result_ptr, 16); + f64x16_v result; + std::memcpy(&result, result_ptr, sizeof(f64x16_v)); + return result; +} + +f64x16_v subf64x16_d(const f64x16_v a, const f64x16_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v)); + subf64(a_ptr, b_ptr, result_ptr, 16); + f64x16_v result; + std::memcpy(&result, result_ptr, sizeof(f64x16_v)); + return result; +} + +f64x16_v mulf64x16_d(const f64x16_v a, const f64x16_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v)); + mulf64(a_ptr, b_ptr, result_ptr, 16); + f64x16_v result; + std::memcpy(&result, result_ptr, sizeof(f64x16_v)); + return result; +} + +f64x16_v divf64x16_d(const f64x16_v a, const f64x16_v b) { + const double *a_ptr = reinterpret_cast(&a); + const double *b_ptr = reinterpret_cast(&b); + double *result_ptr = (double *)aligned_alloc(128, sizeof(f64x16_v)); + divf64(a_ptr, b_ptr, result_ptr, 16); + f64x16_v result; + std::memcpy(&result, result_ptr, sizeof(f64x16_v)); + return result; +} + +} // namespace vector +} // namespace ud + +#endif // HWY_ONCE + +#if HWY_ONCE +namespace ud { +namespace scalar { + +float addf32(float a, float b) { return ud::scalar::add(a, b); } +float subf32(float a, float b) { return ud::scalar::sub(a, b); } +float mulf32(float a, float b) { return ud::scalar::mul(a, b); } +float divf32(float a, float b) { return ud::scalar::div(a, b); } +float sqrtf32(float a) { return ud::scalar::sqrt(a); } + +double addf64(double a, double b) { return ud::scalar::add(a, b); } +double subf64(double a, double b) { return ud::scalar::sub(a, b); } +double mulf64(double a, double b) { return ud::scalar::mul(a, b); } +double divf64(double a, double b) { return ud::scalar::div(a, b); } +double sqrtf64(double a) { return ud::scalar::sqrt(a); } + +} // namespace scalar +} // namespace ud +#endif // HWY_ONCE \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/src/ud_hw.h b/src/libvfcinstrumentonline/rand/src/ud_hw.h new file mode 100644 index 00000000..37f119da --- /dev/null +++ b/src/libvfcinstrumentonline/rand/src/ud_hw.h @@ -0,0 +1,258 @@ +#ifndef _INTERFLOP_UD_LIB_H_ +#define _INTERFLOP_UD_LIB_H_ + +#include "hwy/highway.h" + +namespace ud { +namespace vector { +/* Array functions */ + +/* IEEE-754 binary32 */ + +void addf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count); + +void subf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count); + +void mulf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count); + +void divf32(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result, const size_t count); + +void sqrtf32(const float *HWY_RESTRICT a, float *HWY_RESTRICT result, + const size_t count); + +/* IEEE-754 binary64 */ + +void addf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count); + +void subf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count); + +void mulf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count); + +void divf64(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result, const size_t count); + +void sqrtf64(const double *HWY_RESTRICT a, double *HWY_RESTRICT result, + const size_t count); + +/* Single vector functions, pointer arguments */ + +/* 64-bits */ +#if HWY_MAX_BYTES >= 8 +/* IEEE-754 binary32 x2 */ +void addf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void subf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void mulf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void divf32x2(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +#endif + +/* 128-bits */ +#if HWY_MAX_BYTES >= 16 +/* IEEE-754 binary32 x4 */ +void addf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void subf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void mulf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void divf32x4(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); + +/* IEEE-754 binary64 x2 */ +void addf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void subf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void mulf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void divf64x2(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +#endif + +/* 256-bits */ +#if HWY_MAX_BYTES >= 32 +/* IEEE-754 binary64 x4 */ +void addf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void subf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void mulf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void divf64x4(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); + +/* IEEE-754 binary32 x8 */ +void addf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void subf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void mulf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void divf32x8(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +#endif + +/* 512-bits */ +#if HWY_MAX_BYTES >= 64 +/* IEEE-754 binary64 x8 */ +void addf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void subf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void mulf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void divf64x8(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); + +/* IEEE-754 binary32 x16 */ +void addf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void subf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void mulf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +void divf32x16(const float *HWY_RESTRICT a, const float *HWY_RESTRICT b, + float *HWY_RESTRICT result); +#endif + +/* 1024-bits */ +#if HWY_MAX_BYTES >= 128 +/* IEEE-754 binary64 x16 */ +void addf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void subf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void mulf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +void divf64x16(const double *HWY_RESTRICT a, const double *HWY_RESTRICT b, + double *HWY_RESTRICT result); +#endif + +/* Single vector functions, vector argument */ + +/* 64-bits */ +typedef float f32x2_v __attribute__((vector_size(8))); +#if HWY_MAX_BYTES >= 8 +f32x2_v addf32x2_v(const f32x2_v a, const f32x2_v b); +f32x2_v subf32x2_v(const f32x2_v a, const f32x2_v b); +f32x2_v mulf32x2_v(const f32x2_v a, const f32x2_v b); +f32x2_v divf32x2_v(const f32x2_v a, const f32x2_v b); +#endif + +/* 128-bits */ +typedef double f64x2_v __attribute__((vector_size(16))); +typedef float f32x4_v __attribute__((vector_size(16))); +#if HWY_MAX_BYTES >= 16 +f64x2_v addf64x2_v(const f64x2_v a, const f64x2_v b); +f64x2_v subf64x2_v(const f64x2_v a, const f64x2_v b); +f64x2_v mulf64x2_v(const f64x2_v a, const f64x2_v b); +f64x2_v divf64x2_v(const f64x2_v a, const f64x2_v b); + +f32x4_v addf32x4_v(const f32x4_v a, const f32x4_v b); +f32x4_v subf32x4_v(const f32x4_v a, const f32x4_v b); +f32x4_v mulf32x4_v(const f32x4_v a, const f32x4_v b); +f32x4_v divf32x4_v(const f32x4_v a, const f32x4_v b); +#endif + +/* 256-bits */ +typedef double f64x4_v __attribute__((vector_size(32))); +typedef float f32x8_v __attribute__((vector_size(32))); +#if HWY_MAX_BYTES >= 32 +f64x4_v addf64x4_v(const f64x4_v a, const f64x4_v b); +f64x4_v subf64x4_v(const f64x4_v a, const f64x4_v b); +f64x4_v mulf64x4_v(const f64x4_v a, const f64x4_v b); +f64x4_v divf64x4_v(const f64x4_v a, const f64x4_v b); + +f32x8_v addf32x8_v(const f32x8_v a, const f32x8_v b); +f32x8_v subf32x8_v(const f32x8_v a, const f32x8_v b); +f32x8_v mulf32x8_v(const f32x8_v a, const f32x8_v b); +f32x8_v divf32x8_v(const f32x8_v a, const f32x8_v b); +#endif + +/* 512-bits */ +typedef double f64x8_v __attribute__((vector_size(64))); +typedef float f32x16_v __attribute__((vector_size(64))); +#if HWY_MAX_BYTES >= 64 +f64x8_v addf64x8_v(const f64x8_v a, const f64x8_v b); +f64x8_v subf64x8_v(const f64x8_v a, const f64x8_v b); +f64x8_v mulf64x8_v(const f64x8_v a, const f64x8_v b); +f64x8_v divf64x8_v(const f64x8_v a, const f64x8_v b); + +f32x16_v addf32x16_v(const f32x16_v a, const f32x16_v b); +f32x16_v subf32x16_v(const f32x16_v a, const f32x16_v b); +f32x16_v mulf32x16_v(const f32x16_v a, const f32x16_v b); +f32x16_v divf32x16_v(const f32x16_v a, const f32x16_v b); +#endif + +typedef double f64x16_v __attribute__((vector_size(128))); +#if HWY_MAX_BYTES >= 128 +f64x16_v addf64x16_v(const f64x16_v a, const f64x16_v b); +f64x16_v subf64x16_v(const f64x16_v a, const f64x16_v b); +f64x16_v mulf64x16_v(const f64x16_v a, const f64x16_v b); +f64x16_v divf64x16_v(const f64x16_v a, const f64x16_v b); +#endif + +/* Single vectors functions, vector arguments, dynamic dispatch */ + +/* IEEE-754 binary32 x2 */ +f32x2_v addf32x2_d(const f32x2_v a, const f32x2_v b); +f32x2_v subf32x2_d(const f32x2_v a, const f32x2_v b); +f32x2_v mulf32x2_d(const f32x2_v a, const f32x2_v b); +f32x2_v divf32x2_d(const f32x2_v a, const f32x2_v b); + +/* IEEE-754 binary64 x2 */ +f64x2_v addf64x2_d(const f64x2_v a, const f64x2_v b); +f64x2_v subf64x2_d(const f64x2_v a, const f64x2_v b); +f64x2_v mulf64x2_d(const f64x2_v a, const f64x2_v b); +f64x2_v divf64x2_d(const f64x2_v a, const f64x2_v b); + +/* IEEE-754 binary32 x4 */ +f32x4_v addf32x4_d(const f32x4_v a, const f32x4_v b); +f32x4_v subf32x4_d(const f32x4_v a, const f32x4_v b); +f32x4_v mulf32x4_d(const f32x4_v a, const f32x4_v b); +f32x4_v divf32x4_d(const f32x4_v a, const f32x4_v b); + +/* IEEE-754 binary64 x4 */ +f64x4_v addf64x4_d(const f64x4_v a, const f64x4_v b); +f64x4_v subf64x4_d(const f64x4_v a, const f64x4_v b); +f64x4_v mulf64x4_d(const f64x4_v a, const f64x4_v b); +f64x4_v divf64x4_d(const f64x4_v a, const f64x4_v b); + +/* IEEE-754 binary32 x8 */ +f32x8_v addf32x8_d(const f32x8_v a, const f32x8_v b); +f32x8_v subf32x8_d(const f32x8_v a, const f32x8_v b); +f32x8_v mulf32x8_d(const f32x8_v a, const f32x8_v b); +f32x8_v divf32x8_d(const f32x8_v a, const f32x8_v b); + +/* IEEE-754 binary64 x8 */ +f64x8_v addf64x8_d(const f64x8_v a, const f64x8_v b); +f64x8_v subf64x8_d(const f64x8_v a, const f64x8_v b); +f64x8_v mulf64x8_d(const f64x8_v a, const f64x8_v b); +f64x8_v divf64x8_d(const f64x8_v a, const f64x8_v b); + +/* IEEE-754 binary32 x16 */ +f32x16_v addf32x16_d(const f32x16_v a, const f32x16_v b); +f32x16_v subf32x16_d(const f32x16_v a, const f32x16_v b); +f32x16_v mulf32x16_d(const f32x16_v a, const f32x16_v b); +f32x16_v divf32x16_d(const f32x16_v a, const f32x16_v b); + +/* IEEE-754 binary64 x16 */ +f64x16_v addf64x16_d(const f64x16_v a, const f64x16_v b); +f64x16_v subf64x16_d(const f64x16_v a, const f64x16_v b); +f64x16_v mulf64x16_d(const f64x16_v a, const f64x16_v b); +f64x16_v divf64x16_d(const f64x16_v a, const f64x16_v b); + +} // namespace vector +} // namespace ud + +#endif // _INTERFLOP_UD_LIB_H_ \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/src/utils.hpp b/src/libvfcinstrumentonline/rand/src/utils.hpp index a1c4ee02..a0373364 100644 --- a/src/libvfcinstrumentonline/rand/src/utils.hpp +++ b/src/libvfcinstrumentonline/rand/src/utils.hpp @@ -70,7 +70,7 @@ template <> struct IEEE754 { // Uses the FP constant 𝜓 = 1 − 2^{−𝑝} . // 𝑧 ← RN𝑒 (𝜓𝑎) (= pred(|a|)) // 𝛿 ← RN𝑒 (𝑎 − 𝑧) -// return 𝛿 +// return (z,𝛿) template T get_predecessor_abs(T a) { T phi = std::is_same::value ? 1.0f - 0x1.0p-24f : 1.0 - 0x1.0p-53; T z = a * phi; diff --git a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw-inl.hpp b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw-inl.hpp index 3faf715e..973f5e4f 100644 --- a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw-inl.hpp +++ b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw-inl.hpp @@ -51,6 +51,7 @@ extern RNGInitializer rng; HWY_API float uniformf32() { return rng.get()->Uniform(); } HWY_API double uniformf64() { return rng.get()->Uniform(); } +HWY_API std::uint64_t randomu64() { return rng.get()->operator()(); } } // namespace HWY_NAMESPACE } // namespace xoroshiro256plus @@ -81,6 +82,7 @@ class RNGInitializer { return std::make_unique(seed, thread_id); } }; + extern RNGInitializer rng; template > V uniform(const D d) { @@ -104,6 +106,27 @@ template > V uniform(const D d) { } } +template > V random(const D d) { + sv::debug_msg("[random] START"); + using T = hn::TFromD; + const std::size_t lanes = hn::Lanes(d); + if constexpr (sizeof(lanes * sizeof(T)) <= 16) { + /* allocate at least 128bits */ + /* since VectorXoshiro returns 128bits elements */ + const std::size_t size_rng = std::max(lanes, std::size_t{8}); + auto z = rng.get()->operator()(T{}, size_rng); + auto z_load = hn::Load(d, z.data()); + sv::debug_msg("[random] END"); + return z_load; + } else { + const std::size_t size_rng = lanes; + auto z = rng.get()->operator()(T{}, size_rng); + auto z_load = hn::Load(d, z.data()); + sv::debug_msg("[random] END"); + return z_load; + } +} + } // namespace HWY_NAMESPACE // NOLINTNEXTLINE(google-readability-namespace-comments) diff --git a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.cpp b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.cpp index bc350fa4..4c1c91b3 100644 --- a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.cpp +++ b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.cpp @@ -82,10 +82,12 @@ namespace xoroshiro256plus { HWY_EXPORT(uniformf32); HWY_EXPORT(uniformf64); +HWY_EXPORT(randomu64); namespace static_dispatch { float uniform(float) { return HWY_STATIC_DISPATCH(uniformf32)(); } double uniform(double) { return HWY_STATIC_DISPATCH(uniformf64)(); } +std::uint64_t random() { return HWY_STATIC_DISPATCH(randomu64)(); } } // namespace static_dispatch @@ -98,6 +100,10 @@ HWY_DLLEXPORT double uniform(double) { return HWY_DYNAMIC_DISPATCH(uniformf64)(); } +HWY_DLLEXPORT std::uint64_t random() { + return HWY_DYNAMIC_DISPATCH(randomu64)(); +} + } // namespace dynamic_dispatch } // namespace xoroshiro256plus diff --git a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.h b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.h index 48b75a9b..396e3c4d 100644 --- a/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.h +++ b/src/libvfcinstrumentonline/rand/src/xoroshiro256+_hw.h @@ -44,11 +44,13 @@ namespace xoroshiro256plus { namespace static_dispatch { HWY_DLLEXPORT float uniform(float); HWY_DLLEXPORT double uniform(double); +HWY_DLLEXPORT std::uint64_t random(); } // namespace static_dispatch namespace dynamic_dispatch { HWY_DLLEXPORT float uniform(float); HWY_DLLEXPORT double uniform(double); +HWY_DLLEXPORT std::uint64_t random(); } // namespace dynamic_dispatch } // namespace xoroshiro256plus diff --git a/src/libvfcinstrumentonline/rand/tests/BUILD b/src/libvfcinstrumentonline/rand/tests/BUILD index 46c429bb..dbf52b5d 100644 --- a/src/libvfcinstrumentonline/rand/tests/BUILD +++ b/src/libvfcinstrumentonline/rand/tests/BUILD @@ -2,148 +2,152 @@ load("//tests:macros.bzl", "cc_test_gen", "cc_test_lib_gen") HWY_OPTS = [ "-O2", - "-std=c++20", - "-Wfatal-errors", - # "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)", - # "-DHWY_IS_TEST", -] - -DEPS = [ - "@hwy//:math", ] cc_test_gen( - name = "get_exponent_test", + name = "test_get_exponent", copts = HWY_OPTS, ) cc_test_gen( - name = "get_predabs_test", + name = "test_get_predabs", copts = HWY_OPTS, ) cc_test_gen( - name = "get_pow2_test", + name = "test_get_pow2", copts = HWY_OPTS, ) cc_test_gen( - name = "get_pow2_test-hw", - copts = HWY_OPTS + ["-DSR_DEBUG"], + name = "test_get_pow2_hwy", + copts = HWY_OPTS, + dbg = True, ) cc_test_gen( - name = "twosum_test", + name = "test_twosum", copts = HWY_OPTS, ) cc_test_gen( - name = "twoprodfma_test", + name = "test_twoprodfma", copts = HWY_OPTS, ) +# Scalar tests + cc_test_lib_gen( - name = "udround_test", + name = "sr-scalar-accuracy", + size = "small", + src = ["//tests:test_sr_accuracy.cpp"], + copts = HWY_OPTS, ) cc_test_lib_gen( - name = "sr_accuracy", + name = "sr-scalar-accuracy-dbg", size = "small", - src = ["//tests:srround_test.cpp"], + src = ["//tests:test_sr_accuracy.cpp"], copts = HWY_OPTS, + dbg = True, ) cc_test_lib_gen( - name = "sr_accuracy-dbg", + name = "ud-scalar-accuracy", size = "small", - src = ["//tests:srround_test.cpp"], - copts = HWY_OPTS + ["-DSR_DEBUG"], + src = [ + "//tests:test_ud_accuracy.cpp", + "//src:srcs-ud", + ], + copts = HWY_OPTS, ) -# single test for scalar operations cc_binary( - name = "single-test", - srcs = ["//tests:test_sr_hw.cpp"], - copts = HWY_OPTS + [ - "-DSR_DEBUG", - ], - deps = [ - "@//src:sr", - "@hwy", - ], + name = "sr-scalar-single-test", + srcs = ["//tests:test_single_sr.cpp"], + copts = HWY_OPTS + ["-DSR_DEBUG"], + deps = ["@//src:sr"], ) -# single test for vector operations cc_binary( - name = "single-test-hw", - srcs = ["//tests:test_sr_hw-API.cpp"], - copts = HWY_OPTS + [ - "-DSR_DEBUG", - ], - deps = [ - "@//src:sr", - "@googletest//:gtest", - "@googletest//:gtest_main", - "@hwy", - "@hwy//:hwy_test_util", - ], + name = "ud-scalar-single-test", + srcs = ["//tests:test_single_ud.cpp"], + copts = HWY_OPTS + ["-DSR_DEBUG"], + deps = ["@//src:ud"], ) +# Vector tests + cc_test_lib_gen( name = "xoroshiro", size = "medium", - src = ["//tests:xoroshiro.cpp"], - copts = [ - "-O2", - "-Wfatal-errors", - "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)", - "-DHWY_IS_TEST", - "-DSR_DEBUG", + src = [ + "//src:xoroshiro", + "//tests:test_xoroshiro.cpp", ], + copts = HWY_OPTS, + dbg = True, +) + +cc_binary( + name = "sr-vector-single-test", + srcs = ["//tests:test_single_sr_hwy.cpp"], + copts = HWY_OPTS + ["-DSR_DEBUG"], ) cc_test_lib_gen( - name = "sr_hw_accuracy", + name = "sr-vector-accuracy", size = "medium", - src = ["//tests:srround_test_hw-inl.cpp"], + src = ["//tests:test_sr_hwy_accuracy.cpp"], copts = HWY_OPTS, ) cc_test_lib_gen( - name = "sr_hw_accuracy-dbg", + name = "ud-vector-accuracy", size = "medium", - src = ["//tests:srround_test_hw-inl.cpp"], - copts = HWY_OPTS + ["-DSR_DEBUG"], + src = [ + "//tests:test_ud_hwy_accuracy.cpp", + "//src:srcs-ud", + ], + copts = HWY_OPTS, ) cc_test_lib_gen( - name = "sr_hw_test", + name = "sr-vector-accuracy-dbg", size = "medium", - src = ["//tests:sr_hw_test.cpp"], - copts = [ - "-std=c++20", - "-O2", - "-Wfatal-errors", - "-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)", - "-DHWY_IS_TEST", - "-DSR_DEBUG", - "-Wno-psabi", - ], + src = ["//tests:test_sr_hwy_accuracy.cpp"], + copts = HWY_OPTS, + dbg = True, +) + +cc_test_lib_gen( + name = "sr-perf", + size = "medium", + src = ["//tests:test_sr_hwy_performance.cpp"], + copts = HWY_OPTS, deps = [ "@//src:sr", "@hwy//:nanobenchmark", ], ) +cc_test_lib_gen( + name = "ud-perf", + size = "medium", + src = ["//tests:test_ud_hwy_performance.cpp"], + copts = HWY_OPTS, + deps = [ + "@//src:ud", + "@hwy//:nanobenchmark", + ], +) + test_suite( name = "all", tests = [ ":get_exponent_test", ":get_pow2_test", ":get_predabs_test", - ":srround_boost_test", - ":srround_shishua_test", - ":srround_xoroshiro_test", ":twoprodfma_test", ":twosum_test", ":udround_test", diff --git a/src/libvfcinstrumentonline/rand/tests/macros.bzl b/src/libvfcinstrumentonline/rand/tests/macros.bzl index e5792a15..803a2a9d 100644 --- a/src/libvfcinstrumentonline/rand/tests/macros.bzl +++ b/src/libvfcinstrumentonline/rand/tests/macros.bzl @@ -1,12 +1,14 @@ +""" +""" + COPTS = [ - "-std=c++20", + "-std=c++17", "-I.", "-Wfatal-errors", + "-DHWY_IS_TEST", ] -SRCS = [ - "//src:srcs", -] +SRCS = [] DEPS = [ "@hwy", @@ -19,22 +21,22 @@ DEPS = [ HEADERS = ["//tests:helper.hpp"] -def cc_test_gen(name, src = None, deps = DEPS, copts = COPTS, size = "small"): +def cc_test_gen(name, src = None, deps = DEPS, copts = COPTS, size = "small", dbg = False): native.cc_test( name = name, srcs = [src if src else name + ".cpp"] + HEADERS + SRCS, - copts = COPTS + (copts if copts else []), + copts = COPTS + (copts if copts else []) + (["-DSR_DEBUG"] if dbg else []), deps = deps, size = size, ) -def cc_test_lib_gen(name, src = None, deps = None, copts = COPTS, size = "small"): +def cc_test_lib_gen(name, src = None, deps = None, copts = COPTS, size = "small", dbg = False): srcs = src if src else [name + ".cpp"] srcs += HEADERS native.cc_test( name = name, srcs = srcs + SRCS, - copts = COPTS + (copts if copts else []), + copts = COPTS + (copts if copts else []) + (["-DSR_DEBUG"] if dbg else []), deps = DEPS + (deps if deps else []), size = size, visibility = ["//visibility:public"], diff --git a/src/libvfcinstrumentonline/rand/tests/test.c b/src/libvfcinstrumentonline/rand/tests/test.c deleted file mode 100644 index dc7faa30..00000000 --- a/src/libvfcinstrumentonline/rand/tests/test.c +++ /dev/null @@ -1,17 +0,0 @@ -#include -#include - -int main() { - float a = 1.4999999; - double b = 1.4999999; - double c = 0.1; - double d = __builtin_inff64(); - - printf("a: %.13a\n", a); - printf("b: %.13a\n", b); - printf("c: %.13a\n", c); - printf("d: %.13a\n", (float)c); - printf("e: %.13a\n", d - d); - - return 0; -} diff --git a/src/libvfcinstrumentonline/rand/tests/get_exponent_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_get_exponent.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/get_exponent_test.cpp rename to src/libvfcinstrumentonline/rand/tests/test_get_exponent.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/get_pow2_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_get_pow2.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/get_pow2_test.cpp rename to src/libvfcinstrumentonline/rand/tests/test_get_pow2.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/get_pow2_test-hw.cpp b/src/libvfcinstrumentonline/rand/tests/test_get_pow2_hwy.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/get_pow2_test-hw.cpp rename to src/libvfcinstrumentonline/rand/tests/test_get_pow2_hwy.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/get_predabs_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_get_predabs.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/get_predabs_test.cpp rename to src/libvfcinstrumentonline/rand/tests/test_get_predabs.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/test_sr_hw.cpp b/src/libvfcinstrumentonline/rand/tests/test_single_sr.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/test_sr_hw.cpp rename to src/libvfcinstrumentonline/rand/tests/test_single_sr.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/test_sr_hw-API.cpp b/src/libvfcinstrumentonline/rand/tests/test_single_sr_hwy.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/test_sr_hw-API.cpp rename to src/libvfcinstrumentonline/rand/tests/test_single_sr_hwy.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/test_single_ud.cpp b/src/libvfcinstrumentonline/rand/tests/test_single_ud.cpp new file mode 100644 index 00000000..8b84c989 --- /dev/null +++ b/src/libvfcinstrumentonline/rand/tests/test_single_ud.cpp @@ -0,0 +1,70 @@ +#include +#include + +#include "hwy/highway.h" +#include "hwy/print-inl.h" + +#include "src/ud.h" + +template void test_bin(T a, T b, int op) { + + constexpr const char *fptype = std::is_same_v ? "f32" : "f64"; + constexpr const char *opname[] = {"Add", "Sub", "Mul", "Div"}; + constexpr const char *fmt = std::is_same_v ? "%+.6a " : "%+.13a "; + + const std::string opstr = opname[op]; + std::cerr << opstr << fptype << ":\n"; + + switch (op) { + case 0: + fprintf(stderr, fmt, ud::scalar::add(a, b)); + break; + case 1: + fprintf(stderr, fmt, ud::scalar::sub(a, b)); + break; + case 2: + fprintf(stderr, fmt, ud::scalar::mul(a, b)); + break; + case 3: + fprintf(stderr, fmt, ud::scalar::div(a, b)); + break; + } + + std::cerr << std::endl; +} + +int main(int argc, char *argv[]) { + + if (argc != 5) { + std::cerr << "Usage: " << argv[0] << " \n"; + return 1; + } + + // print HWY_TARGET + std::cout << "HWY_TARGET: " << HWY_TARGET << std::endl; + + double a = std::stof(argv[1]); + double b = std::stof(argv[2]); + const std::string op = std::string(argv[3]); + const std::string type = std::string(argv[4]); + + const auto op_map = std::map{ + {"add", 0}, + {"sub", 1}, + {"mul", 2}, + {"div", 3}, + }; + + const auto type_map = std::map{ + {"f32", 0}, + {"f64", 1}, + }; + + if (type == "f32") { + test_bin(a, b, op_map.at(op)); + } else if (type == "f64") { + test_bin(a, b, op_map.at(op)); + } + + return 0; +} diff --git a/src/libvfcinstrumentonline/rand/tests/srround_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_sr_accuracy.cpp similarity index 98% rename from src/libvfcinstrumentonline/rand/tests/srround_test.cpp rename to src/libvfcinstrumentonline/rand/tests/test_sr_accuracy.cpp index bba87884..0fb457ea 100644 --- a/src/libvfcinstrumentonline/rand/tests/srround_test.cpp +++ b/src/libvfcinstrumentonline/rand/tests/test_sr_accuracy.cpp @@ -242,7 +242,9 @@ void check_distribution_match(T a, T b, // do the test only if the operation is not exact (probability is not zero) bool is_exact = probability_down == 1 and probability_up == 1; - // do + // do the test only if the operation is not exact (probability is not zero) + // do the test if the distance between the reference and the estimated value + // is greater than the minimum subnormal // do not the test if the probability is lower than 1/repetitions bool compare_down_values = not is_exact and down != 0 and probability_down > (1.0 / repetitions) and @@ -251,8 +253,7 @@ void check_distribution_match(T a, T b, probability_up > (1.0 / repetitions) and distance_error > helper::IEEE754::min_subnormal; - if (not is_exact and down != 0 and probability_down > (1.0 / repetitions) and - distance_error > helper::IEEE754::min_subnormal) + if (compare_down_values) EXPECT_THAT(counter.down(), Eq(static_cast(down))) << "Value ↓ is not equal to reference\n" << " type: " << ftype << "\n" @@ -277,8 +278,7 @@ void check_distribution_match(T a, T b, << distance_error_msg << std::defaultfloat << flush(); // do not the test if the probability is lower than 1/repetitions - if (not is_exact and up != 0 and probability_up > (1.0 / repetitions) and - distance_error > helper::IEEE754::min_subnormal) + if (compare_up_values) EXPECT_THAT(counter.up(), Eq(static_cast(up))) << "Value ↑ is not equal to reference\n" << " type: " << ftype << "\n" diff --git a/src/libvfcinstrumentonline/rand/tests/test_sr_hwy_accuracy.cpp b/src/libvfcinstrumentonline/rand/tests/test_sr_hwy_accuracy.cpp new file mode 100644 index 00000000..e2019885 --- /dev/null +++ b/src/libvfcinstrumentonline/rand/tests/test_sr_hwy_accuracy.cpp @@ -0,0 +1,802 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/srround_test_hw-inl.cpp" +#include "hwy/foreach_target.h" + +#include "hwy/highway.h" +#include "hwy/base.h" +#include "hwy/tests/test_util-inl.h" +#include "src/debug_hwy-inl.h" +#include "src/sr_hw-inl.h" +// clang-format on + +#include "src/sr_hw.h" +#include "src/utils.hpp" +#include "tests/helper.hpp" + +HWY_BEFORE_NAMESPACE(); // at file scope + +namespace sr { + +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +namespace { + +using ::testing::AllOf; +using ::testing::Ge; +using ::testing::Le; +using ::testing::Lt; + +constexpr auto default_alpha = 0.001; +#ifdef SR_DEBUG +constexpr auto default_repetitions = 100; +#else +constexpr auto default_repetitions = 10'000; +#endif + +static auto distribution_failed_tests_counter = 0; +static auto distribution_tests_counter = 0; + +namespace reference { +// return pred(|s|) + +// twosum reference +// compute in double precision if the input type is float +// compute in quad precision if the input type is double +template ::H> +H add(T a, T b) { + return static_cast(a) + static_cast(b); +} + +template ::H> +H sub(T a, T b) { + return static_cast(a) - static_cast(b); +} + +template ::H> +H mul(T a, T b) { + return static_cast(a) * static_cast(b); +} + +template ::H> +H div(T a, T b) { + return static_cast(a) / static_cast(b); +} + +}; // namespace reference + +namespace srvh = sr::vector::HWY_NAMESPACE; + +struct SRAdd { + static constexpr char name[] = "add"; + static constexpr char symbol[] = "+"; + + template , typename T = hn::TFromD> + V operator()(D d, V a, V b) { + return srvh::sr_add(a, b); + } + + template ::H> + static H reference(T a, T b) { + return reference::add(a, b); + } +}; + +struct SRSub { + static constexpr char name[] = "sub"; + static constexpr char symbol[] = "-"; + + template , typename T = hn::TFromD> + V operator()(D d, V a, V b) { + return srvh::sr_sub(a, b); + } + + template ::H> + static H reference(T a, T b) { + return reference::sub(a, b); + } +}; + +struct SRMul { + static constexpr char name[] = "mul"; + static constexpr char symbol[] = "*"; + + template , typename T = hn::TFromD> + V operator()(D d, V a, V b) { + return srvh::sr_mul(a, b); + } + + template ::H> + static H reference(T a, T b) { + return reference::mul(a, b); + } +}; + +struct SRDiv { + static constexpr char name[] = "div"; + static constexpr char symbol[] = "/"; + + template , typename T = hn::TFromD> + V operator()(D d, V a, V b) { + return srvh::sr_div(a, b); + } + + template ::H> + static H reference(T a, T b) { + return reference::div(a, b); + } +}; + +template ::H> +std::tuple compute_distance_error(T a, T b, + H reference) { + T ref_cast = static_cast(reference); + + if (helper::isnan(a) or helper::isnan(b) or helper::isnan(reference) or + helper::isinf(a) or helper::isinf(b) or helper::isinf(reference) or + helper::isinf(ref_cast)) { + return {0, 0, reference, reference, 0, ""}; + } + + H error = 0, error_c = 0; + H probability_down = 0, probability_up = 0; + H next = 0, prev = 0; + H ulp = helper::get_ulp(ref_cast); + + if (ref_cast == reference) { + error = 0; + probability_down = 1; + probability_up = 1; + prev = reference; + next = reference; + } else { + error = helper::absolute_distance(reference, static_cast(ref_cast)); + error_c = helper::absolute_distance(static_cast(ulp), error); + prev = (ref_cast < reference) ? ref_cast : (ref_cast - ulp); + next = (ref_cast < reference) ? (ref_cast + ulp) : ref_cast; + probability_down = (next - reference) / ulp; + probability_up = (reference - prev) / ulp; + + if (((error + error_c) != ulp)) { + std::cerr << "error + error_c != ulp" << "\n" + << "error: " << helper::hexfloat(error) << "\n" + << "error_c: " << helper::hexfloat(error_c) << "\n" + << "prev: " << helper::hexfloat(prev) << "\n" + << "next: " << helper::hexfloat(next) << "\n" + << "ulp: " << helper::hexfloat(ulp) << std::endl; + HWY_ASSERT(false); + } + if (probability_down + probability_up != 1) { + std::cerr << "probability_down + probability_up != 1" << "\n" + << "probability_down: " << probability_down << "\n" + << "probability_up: " << probability_up << "\n" + << "reference: " << helper::hexfloat(reference) << "\n" + << "prev: " << helper::hexfloat(prev) << "\n" + << "next: " << helper::hexfloat(next) << "\n" + << "error: " << helper::hexfloat(error) << "\n" + << "error_c: " << helper::hexfloat(error_c) << "\n" + << "ulp: " << helper::hexfloat(ulp) << std::endl; + HWY_ASSERT(false); + } + } + + std::ostringstream os; + os << std::hexfloat << std::setprecision(13); + os << "-- compute_distance_error --" << std::endl; + os << " reference: " << helper::hexfloat(reference) << std::endl; + if constexpr (std::is_same_v) { + os << " (float)reference: " << helper::hexfloat(ref_cast) << std::endl; + os << " |ref-(float)ref|: " << helper::hexfloat(error) << std::endl; + } + if constexpr (std::is_same_v) { + os << " (double)reference: " << helper::hexfloat(ref_cast) << std::endl; + os << " |ref-(double)ref|: " << helper::hexfloat(error) << std::endl; + } + os << " error_c: " << helper::hexfloat(error_c) << std::endl; + os << " ulp: " << helper::hexfloat(ulp) << std::endl; + os << " reference ↓: " << helper::hexfloat(prev) << std::endl; + os << " reference ↑: " << helper::hexfloat(next) << std::endl; + os << std::defaultfloat; + os << " p: " << probability_down << std::endl; + os << " 1-p: " << probability_up << std::endl; + auto msg = os.str(); + + return {probability_down, probability_up, prev, next, error, msg}; +} + +template , + typename T = hn::TFromD> +std::vector> eval_op(D d, V a, V b, const int repetitions) { + + const size_t lanes = hn::Lanes(d); + auto op = Op(); + + std::vector> c(lanes); + for (int i = 0; i < repetitions; i++) { + auto v = op(d, a, b); + for (size_t j = 0; j < lanes; j++) { + auto vj = hn::ExtractLane(v, j); + c[j][vj]++; + } + } + + return c; +} + +template std::string fmt_proba(const T &x) { + std::ostringstream os; + os << std::fixed << std::setprecision(5) << x << std::defaultfloat; + return os.str(); +} + +std::string flush() { + debug_flush(); + return ""; +} + +template , typename T = hn::TFromD> +void assert_equal_inputs(D d, V a) { + auto a_min = hn::ReduceMin(d, a); + auto a_max = hn::ReduceMax(d, a); + if (helper::isnan(a_min) and helper::isnan(a_max)) + return; + +#if HWY_TARGET != HWY_EMU128 + if (a_min != a_max) { + std::cerr << "Vector does not have equal inputs!\n" + << "min(a): " << helper::hexfloat(a_min) << "\n" + << "max(a): " << helper::hexfloat(a_max) << std::endl; + HWY_ASSERT(false); + } +#endif +} + +template , + typename T = hn::TFromD> +void check_distribution_match(D d, V va, V vb, + const long repetitions = default_repetitions, + const float alpha = default_alpha) { + using H = typename helper::IEEE754::H; + + auto a = hn::GetLane(va); + auto b = hn::GetLane(vb); + + // ensure that we have vector of same value + assert_equal_inputs(d, va); + assert_equal_inputs(d, vb); + + auto counters = eval_op(d, va, vb, repetitions); + + H reference = Op::reference(a, b); + auto [probability_down, probability_up, down, up, distance_error, + distance_error_msg] = compute_distance_error(a, b, reference); + + size_t lane = 0; + for (auto &counter : counters) { + + auto count_down = counter.down_count(); + auto count_up = counter.up_count(); + auto probability_down_estimated = count_down / (double)repetitions; + auto probability_up_estimated = count_up / (double)repetitions; + + if (not helper::isfinite(counter.up()) or + not helper::isfinite(counter.down())) { + if (helper::isnan(counter.up()) and helper::isnan(reference)) + return; + if (helper::isnan(counter.down()) and helper::isnan(reference)) + return; + if (helper::isinf(counter.up()) and helper::isinf(reference)) { + if (counter.up() != reference) { + std::cerr << "Error in inf comparison\n" + << "counter.up() != reference\n" + << "counter.up(): " << counter.up() << "\n" + << "reference: " << reference << "\n" + << distance_error_msg << std::defaultfloat << flush(); + HWY_ASSERT(false); + } + } + if (helper::isinf(counter.down()) and helper::isinf(reference)) { + if (counter.down() != reference) { + std::cerr << "Error in inf comparison\n" + << "counter.down() != reference\n" + << "counter.down(): " << counter.down() << "\n" + << "reference: " << reference << "\n" + << distance_error_msg << std::defaultfloat << flush(); + HWY_ASSERT(false); + } + } + return; + } + + /* code */ + if (0.0 > probability_down or probability_down > 1.0) { + std::cerr << "Probability ↓ is not in [0, 1] range\n" + << "-- theoretical -\n" + << " probability ↓: " << fmt_proba(probability_down) << "\n" + << " probability ↑: " << fmt_proba(probability_up) << "\n" + << "--- estimated --\n" + << " sample size: " << repetitions << "\n" + << " #↓: " << count_down << " (" + << fmt_proba(probability_down_estimated) << ")\n" + << " #↑: " << count_up << " (" + << fmt_proba(probability_up_estimated) << ")\n" + << std::hexfloat << "" << " ↓: " << counter.down() + << "\n" + << " ↑: " << counter.up() << "\n" + << distance_error_msg << std::defaultfloat << flush(); + HWY_ASSERT(false); + } + + // do the test only if the operation is not exact (probability is not zero) + bool is_exact = probability_down == 1 and probability_up == 1; + + // do the test only if the operation is not exact (probability is not zero) + // do the test if the distance between the reference and the estimated value + // is greater than the minimum subnormal + // do not the test if the probability is lower than 1/repetitions + bool compare_down_values = + not is_exact and down != 0 and + probability_down > (1.0 / repetitions) and + distance_error > helper::IEEE754::min_subnormal; + bool compare_up_values = not is_exact and up != 0 and + probability_up > (1.0 / repetitions) and + distance_error > helper::IEEE754::min_subnormal; + + // 95% relative error + auto error_down = (probability_down_estimated / probability_down); + auto error_up = (probability_up_estimated / probability_up); + + // binomial test + auto test = helper::binomial_test(repetitions, count_down, + static_cast(probability_down)); + + const auto op_name = Op::name; + const auto op_symbol = Op::symbol; + const auto ftype = typeid(T).name(); + + // Bonferroni correction, divide by the number of lanes + const auto lanes = hn::Lanes(d); + const auto alpha_bon = (alpha / 2) / lanes; + + if (compare_down_values and static_cast(down) != counter.down()) { + std::cerr << "Value ↓ is not equal to reference\n" + << " Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n" + << " type: " << ftype << "\n" + << " op: " << op_name << "\n" + << " alpha: " << alpha_bon << "\n" + << std::hexfloat << std::setprecision(13) << "" + << " a: " << helper::hexfloat(a) << "\n" + << " b: " << helper::hexfloat(b) << "\n" + << " a" << op_symbol + << "b: " << helper::hexfloat(reference) << "\n" + << std::defaultfloat << "" << "-- theoretical -\n" + << " probability ↓: " << fmt_proba(probability_down) << "\n" + << " probability ↑: " << fmt_proba(probability_up) << "\n" + << "--- estimated --\n" + << " sample size: " << repetitions << "\n" + << " #↓: " << count_down << " (" + << fmt_proba(probability_down_estimated) << ")\n" + << " #↑: " << count_up << " (" + << fmt_proba(probability_up_estimated) << ")\n" + << std::hexfloat + << " ↓: " << helper::hexfloat(counter.down()) + << "\n" + << " ↑: " << helper::hexfloat(counter.up()) << "\n" + << distance_error_msg << std::defaultfloat << flush(); + HWY_ASSERT(0); + } + + if (compare_up_values and static_cast(up) != counter.up()) { + std::cerr << "Value ↑ is not equal to reference\n" + << " Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n" + << " type: " << ftype << "\n" + << " op: " << op_name << "\n" + << " alpha: " << alpha_bon << "\n" + << std::hexfloat << std::setprecision(13) << "" + << " a: " << helper::hexfloat(a) << "\n" + << " b: " << helper::hexfloat(b) << "\n" + << " a" << op_symbol + << "b: " << helper::hexfloat(reference) << "\n" + << std::defaultfloat << "" << "-- theoretical -\n" + << " probability ↓: " << fmt_proba(probability_down) << "\n" + << " probability ↑: " << fmt_proba(probability_up) << "\n" + << "--- estimated --\n" + << " sample size: " << repetitions << "\n" + << " #↓: " << count_down << " (" + << fmt_proba(probability_down_estimated) << ")\n" + << " #↑: " << count_up << " (" + << fmt_proba(probability_up_estimated) << ")\n" + << std::hexfloat + << " ↓: " << helper::hexfloat(counter.down()) + << "\n" + << " ↑: " << helper::hexfloat(counter.up()) << "\n" + << distance_error_msg << std::defaultfloat << flush(); + HWY_ASSERT(0); + } + + // if (test.pvalue == 0) + // return; + + if (compare_down_values and compare_up_values and test.pvalue < alpha_bon) { + std::cerr << "Null hypotheis rejected!\n" + << " Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n" + << " type: " << ftype << "\n" + << " op: " << op_name << "\n" + << " alpha: " << alpha_bon << "\n" + << std::hexfloat << std::setprecision(13) << "" + << " a: " << helper::hexfloat(a) << "\n" + << " b: " << helper::hexfloat(b) << "\n" + << " a" << op_symbol + << "b: " << helper::hexfloat(reference) << "\n" + << std::defaultfloat << "" << "-- theoretical -\n" + << " probability ↓: " << fmt_proba(probability_down) << "\n" + << " probability ↑: " << fmt_proba(probability_up) << "\n" + << "--- estimated --\n" + << " sample size: " << repetitions << "\n" + << " #↓: " << count_down << " (" + << fmt_proba(probability_down_estimated) << ")\n" + << " #↑: " << count_up << " (" + << fmt_proba(probability_up_estimated) << ")\n" + << " p-value: " << test.pvalue << "\n" + << " ↓ %: " << error_down * 100 << "\n" + << " ↑ %: " << error_up * 100 << "\n" + << std::hexfloat + << " ↓: " << helper::hexfloat(counter.down()) + << "\n" + << " ↑: " << helper::hexfloat(counter.up()) << "\n" + << distance_error_msg << std::defaultfloat << flush(); + HWY_ASSERT(0); + } + distribution_tests_counter++; + lane++; + debug_reset(); + } +} + +template std::vector get_simple_case() { + std::vector simple_case = {0.0, + 1.0, + 2.0, + 3.0, + std::numeric_limits::min(), + std::numeric_limits::lowest(), + std::numeric_limits::max(), + std::numeric_limits::epsilon(), + std::numeric_limits::infinity(), + std::numeric_limits::denorm_min(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::signaling_NaN()}; + return simple_case; +} + +template > void do_run_test_exact_add(D d) { + constexpr int32_t mantissa = helper::IEEE754::mantissa; + + for (int i = 0; i <= 5; i++) { + auto va = hn::Set(d, 1.25); + auto vb = hn::Set(d, std::ldexp(1.0, -(mantissa + i))); + check_distribution_match(d, va, vb); + } +} + +template , + typename T = hn::TFromD> +void do_run_test_simple_case(D d) { + auto simple_case = get_simple_case(); + for (auto a : simple_case) { + for (auto b : simple_case) { + auto va = hn::Set(d, a); + auto vb = hn::Set(d, b); + auto va_neg = hn::Neg(va); + auto vb_neg = hn::Neg(vb); + check_distribution_match(d, va, vb); + check_distribution_match(d, va, vb_neg); + check_distribution_match(d, va_neg, vb); + check_distribution_match(d, va_neg, vb_neg); + } + } +} + +template , + typename T = hn::TFromD> +void do_run_test_random(D d, const double start_range_1st = 0.0, + const double end_range_1st = 1.0, + const double start_range_2nd = 0.0, + const double end_range_2nd = 1.0) { + helper::RNG rng1(start_range_1st, end_range_1st); + helper::RNG rng2(start_range_2nd, end_range_2nd); + + for (int i = 0; i < 100; i++) { + T a = rng1(); + T b = rng2(); + auto va = hn::Set(d, a); + auto vb = hn::Set(d, b); + auto va_neg = hn::Neg(va); + auto vb_neg = hn::Neg(vb); + check_distribution_match(d, va, vb); + check_distribution_match(d, va, vb_neg); + check_distribution_match(d, va_neg, vb); + check_distribution_match(d, va_neg, vb_neg); + } +} + +void check_failed_tests(float threshold, bool skip = false) { + const auto failed_ratio = + distribution_failed_tests_counter / (double)distribution_tests_counter; + if (failed_ratio > threshold) { + std::cerr << "Number of failed tests above threshold!\n" + << "Threshold: " << threshold << "\n" + << "Failed tests: " << distribution_failed_tests_counter + << " (" << failed_ratio << ")\n" + << "Number of tests: " << distribution_tests_counter << "\n" + << std::endl; + HWY_ASSERT(skip); + } + distribution_failed_tests_counter = 0; + distribution_tests_counter = 0; +} + +void assert_exact() { check_failed_tests(0); } +void assert_almost_exact(bool skip = false) { check_failed_tests(0.05, skip); } + +struct TestExactOperationsAdd { + template void operator()(T /*unused*/, D d) { + do_run_test_exact_add(d); + assert_exact(); + } +}; + +template struct TestBasicAssertions { + template void operator()(T /*unused*/, D d) { + do_run_test_simple_case(d); + assert_almost_exact(); + } +}; + +using TestBasicAssertionsAdd = TestBasicAssertions; +using TestBasicAssertionsSub = TestBasicAssertions; +using TestBasicAssertionsMul = TestBasicAssertions; +using TestBasicAssertionsDiv = TestBasicAssertions; + +HWY_NOINLINE void TestAllExactOperationsAdd() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllBasicAssertionsAdd() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllBasicAssertionsSub() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllBasicAssertionsMul() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllBasicAssertionsDiv() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +template struct TestRandom01Assertions { + template void operator()(T /*unused*/, D d) { + do_run_test_random(d); + } +}; + +using TestRandom01AssertionsAdd = TestRandom01Assertions; +using TestRandom01AssertionsSub = TestRandom01Assertions; +using TestRandom01AssertionsMul = TestRandom01Assertions; +using TestRandom01AssertionsDiv = TestRandom01Assertions; + +HWY_NOINLINE void TestAllRandom01AssertionsAdd() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandom01AssertionsSub() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandom01AssertionsMul() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandom01AssertionsDiv() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +template , + typename T = hn::TFromD> +void do_random_no_overlap_test(D d) { + double start_range_1st = 0.0; + double end_range_1st = 1.0; + int s2; + if constexpr (std::is_same_v) { + s2 = -25; + } else { + s2 = -54; + }; + double start_range_2nd = std::ldexp(1.0, s2 + 1); + double end_range_2nd = std::ldexp(1.0, s2 + 1); + do_run_test_random(d, start_range_1st, end_range_1st, start_range_2nd, + end_range_2nd); +} + +template struct TestRandomNoOverlapAssertions { + template void operator()(T /*unused*/, D d) { + do_random_no_overlap_test(d); + } +}; + +using TestRandomNoOverlapAssertionsAdd = TestRandomNoOverlapAssertions; +using TestRandomNoOverlapAssertionsSub = TestRandomNoOverlapAssertions; +using TestRandomNoOverlapAssertionsMul = TestRandomNoOverlapAssertions; +using TestRandomNoOverlapAssertionsDiv = TestRandomNoOverlapAssertions; + +HWY_NOINLINE void TestAllRandomNoOverlapAssertionsAdd() { + hn::ForFloat3264Types( + hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomNoOverlapAssertionsSub() { + hn::ForFloat3264Types( + hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomNoOverlapAssertionsMul() { + hn::ForFloat3264Types( + hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomNoOverlapAssertionsDiv() { + hn::ForFloat3264Types( + hn::ForPartialVectors()); +} + +template , + typename T = hn::TFromD> +void do_random_last_bit_overlap(D d) { + double start_range_1st = 1.0; + double end_range_1st = 2.0; + int s2; + if constexpr (std::is_same_v) { + s2 = -24; + } else { + s2 = -53; + } + double start_range_2nd = std::ldexp(1.0, s2 + 1); + double end_range_2nd = std::ldexp(1.0, s2 + 1); + do_run_test_random(d, start_range_1st, end_range_1st, start_range_2nd, + end_range_2nd); +} + +template struct TestRandomLastBitOverlap { + template void operator()(T /*unused*/, D d) { + do_random_last_bit_overlap(d); + } +}; + +using TestRandomLastBitOverlapAdd = TestRandomLastBitOverlap; +using TestRandomLastBitOverlapSub = TestRandomLastBitOverlap; +using TestRandomLastBitOverlapMul = TestRandomLastBitOverlap; +using TestRandomLastBitOverlapDiv = TestRandomLastBitOverlap; + +HWY_NOINLINE void TestAllRandomLastBitOverlapAdd() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomLastBitOverlapSub() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomLastBitOverlapMul() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomLastBitOverlapDiv() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +template , + typename T = hn::TFromD> +void do_random_mid_overlap_test(D d) { + double start_range_1st = 1.0; + double end_range_1st = 2.0; + int s2; + if constexpr (std::is_same_v) { + s2 = -13; + } else { + s2 = -27; + }; + double start_range_2nd = std::ldexp(1.0, s2 + 1); + double end_range_2nd = std::ldexp(1.0, s2 + 1); + do_run_test_random(d, start_range_1st, end_range_1st, start_range_2nd, + end_range_2nd); +} + +template struct TestRandomMidOverlap { + template void operator()(T /*unused*/, D d) { + do_random_mid_overlap_test(d); + } +}; + +using TestRandomMidOverlapAdd = TestRandomMidOverlap; +using TestRandomMidOverlapSub = TestRandomMidOverlap; +using TestRandomMidOverlapMul = TestRandomMidOverlap; +using TestRandomMidOverlapDiv = TestRandomMidOverlap; + +HWY_NOINLINE void TestAllRandomMidOverlapAdd() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomMidOverlapSub() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomMidOverlapMul() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +HWY_NOINLINE void TestAllRandomMidOverlapDiv() { + hn::ForFloat3264Types(hn::ForPartialVectors()); +} + +} // namespace +} // namespace HWY_NAMESPACE +} // namespace sr +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace sr { +namespace HWY_NAMESPACE { + +HWY_BEFORE_TEST(SRRoundTest); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllExactOperationsAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsDiv); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsDiv); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsDiv); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapDiv); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapDiv); +HWY_AFTER_TEST(); + +} // namespace HWY_NAMESPACE +} // namespace sr + +HWY_TEST_MAIN(); + +#endif // HWY_ONCE + +// int main(int argc, char **argv) { +// testing::InitGoogleTest(&argc, argv); +// init(); +// return RUN_ALL_TESTS(); +// } \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/tests/sr_hw_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_sr_hwy_performance.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/sr_hw_test.cpp rename to src/libvfcinstrumentonline/rand/tests/test_sr_hwy_performance.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/twoprodfma_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_twoprodfma.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/twoprodfma_test.cpp rename to src/libvfcinstrumentonline/rand/tests/test_twoprodfma.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/twosum_test.cpp b/src/libvfcinstrumentonline/rand/tests/test_twosum.cpp similarity index 100% rename from src/libvfcinstrumentonline/rand/tests/twosum_test.cpp rename to src/libvfcinstrumentonline/rand/tests/test_twosum.cpp diff --git a/src/libvfcinstrumentonline/rand/tests/srround.cpp b/src/libvfcinstrumentonline/rand/tests/test_ud_accuracy.cpp similarity index 66% rename from src/libvfcinstrumentonline/rand/tests/srround.cpp rename to src/libvfcinstrumentonline/rand/tests/test_ud_accuracy.cpp index c6cb7c65..b4590143 100644 --- a/src/libvfcinstrumentonline/rand/tests/srround.cpp +++ b/src/libvfcinstrumentonline/rand/tests/test_ud_accuracy.cpp @@ -5,17 +5,24 @@ #include #include -#include "src/eft.hpp" -#include "src/main.hpp" -#include "src/sr.hpp" +#include +#include + +#include "src/ud.h" #include "src/utils.hpp" #include "tests/helper.hpp" -constexpr auto default_alpha = 0.001; +using ::testing::AllOf; +using ::testing::Eq; +using ::testing::Ge; +using ::testing::Le; +using ::testing::Lt; + +constexpr auto default_alpha = 0.0001; #ifdef SR_DEBUG constexpr auto default_repetitions = 10; #else -constexpr auto default_repetitions = 100'000; +constexpr auto default_repetitions = 1'000; #endif namespace reference { // return pred(|s|) @@ -24,22 +31,22 @@ namespace reference { // compute in double precision if the input type is float // compute in quad precision if the input type is double template ::H> -H sr_add(T a, T b) { +H add(T a, T b) { return static_cast(a) + static_cast(b); } template ::H> -H sr_sub(T a, T b) { +H sub(T a, T b) { return static_cast(a) - static_cast(b); } template ::H> -H sr_mul(T a, T b) { +H mul(T a, T b) { return static_cast(a) * static_cast(b); } template ::H> -H sr_div(T a, T b) { +H div(T a, T b) { return static_cast(a) / static_cast(b); } @@ -47,16 +54,16 @@ template ::H> std::function get_operator() { static_assert(std::is_base_of_v, Op>); if constexpr (std::is_same_v, Op>) { - return sr_add; + return add; } if constexpr (std::is_same_v>) { - return sr_sub; + return sub; } if constexpr (std::is_same_v>) { - return sr_mul; + return mul; } if constexpr (std::is_same_v>) { - return sr_div; + return div; } } @@ -66,59 +73,70 @@ template typename Op::function get_target_operator() { static_assert(std::is_base_of_v, Op>); if constexpr (std::is_same_v>) { - return sr_add; + return ud::scalar::add; } if constexpr (std::is_same_v>) { - return sr_sub; + return ud::scalar::sub; } if constexpr (std::is_same_v>) { - return sr_mul; + return ud::scalar::mul; } if constexpr (std::is_same_v>) { - return sr_div; + return ud::scalar::div; } } template ::H> -std::pair compute_distance_error(T a, T b, H reference) { +std::tuple compute_distance_error(T a, T b, + H reference) { T ref_cast = static_cast(reference); - H error = helper::absolute_distance(reference, static_cast(ref_cast)); - H ulp = helper::get_ulp(ref_cast); - - H probability_down = 1 - (error / ulp); - H probability_up = 1 - probability_down; - - if (ref_cast > reference) { - std::swap(probability_down, probability_up); - } - - if (error == 0 or ulp == 0 or - helper::abs(reference) < helper::IEEE754::min_subnormal or - helper::abs(error) < helper::IEEE754::min_subnormal) { - probability_down = 0; - probability_up = 1; + if (helper::isnan(a) or helper::isnan(b) or helper::isnan(reference) or + helper::isinf(a) or helper::isinf(b) or helper::isinf(reference) or + helper::isinf(ref_cast)) { + return {0, 0, reference, reference, 0, ""}; } - return {probability_down, probability_up}; -} - -template ::H> -std::string compute_distance_error_str(T a, T b, H reference) { - T ref_cast = static_cast(reference); - H error = helper::absolute_distance(reference, static_cast(ref_cast)); + H error = 0, error_c = 0; + H probability_down = 0, probability_up = 0; + H next = 0, prev = 0; H ulp = helper::get_ulp(ref_cast); - auto [probability_down, probability_up] = - compute_distance_error(a, b, reference); - - H next, prev; - if (ref_cast < reference) { - prev = ref_cast; - next = static_cast(reference + error); + if (ref_cast == reference) { + error = 0; + probability_down = 1; + probability_up = 1; + prev = reference; + next = reference; } else { - prev = static_cast(reference - error); - next = ref_cast; + error = helper::absolute_distance(reference, static_cast(ref_cast)); + error_c = helper::absolute_distance(static_cast(ulp), error); + prev = (ref_cast < reference) ? ref_cast : (ref_cast - ulp); + next = (ref_cast < reference) ? (ref_cast + ulp) : ref_cast; + probability_down = (next - reference) / ulp; + probability_up = (reference - prev) / ulp; + + if (((error + error_c) != ulp)) { + std::cerr << "error + error_c != ulp" << "\n" + << "error: " << helper::hexfloat(error) << "\n" + << "error_c: " << helper::hexfloat(error_c) << "\n" + << "prev: " << helper::hexfloat(prev) << "\n" + << "next: " << helper::hexfloat(next) << "\n" + << "ulp: " << helper::hexfloat(ulp) << std::endl; + HWY_ASSERT(false); + } + if (probability_down + probability_up != 1) { + std::cerr << "probability_down + probability_up != 1" << "\n" + << "probability_down: " << probability_down << "\n" + << "probability_up: " << probability_up << "\n" + << "reference: " << helper::hexfloat(reference) << "\n" + << "prev: " << helper::hexfloat(prev) << "\n" + << "next: " << helper::hexfloat(next) << "\n" + << "error: " << helper::hexfloat(error) << "\n" + << "error_c: " << helper::hexfloat(error_c) << "\n" + << "ulp: " << helper::hexfloat(ulp) << std::endl; + HWY_ASSERT(false); + } } std::ostringstream os; @@ -133,13 +151,16 @@ std::string compute_distance_error_str(T a, T b, H reference) { os << " (double)reference: " << helper::hexfloat(ref_cast) << std::endl; os << " |ref-(double)ref|: " << helper::hexfloat(error) << std::endl; } + os << " error_c: " << helper::hexfloat(error_c) << std::endl; os << " ulp: " << helper::hexfloat(ulp) << std::endl; os << " reference ↓: " << helper::hexfloat(prev) << std::endl; os << " reference ↑: " << helper::hexfloat(next) << std::endl; os << std::defaultfloat; os << " p: " << probability_down << std::endl; os << " 1-p: " << probability_up << std::endl; - return os.str(); + auto msg = os.str(); + + return {probability_down, probability_up, prev, next, error, msg}; } template @@ -173,10 +194,13 @@ void check_distribution_match(T a, T b, const float alpha = default_alpha) { using H = typename helper::IEEE754::H; + const auto op_name = Op::name; + const auto ftype = Op::ftype; + auto reference_op = reference::get_operator(); H reference = reference_op(a, b); - auto [probability_down, probability_up] = - compute_distance_error(a, b, reference); + auto [probability_down, probability_up, down, up, distance_error, + distance_error_msg] = compute_distance_error(a, b, reference); auto counter = eval_op(a, b, repetitions); auto count_down = counter.down_count(); @@ -199,58 +223,63 @@ void check_distribution_match(T a, T b, return; } - /* code */ - // EXPECT_THAT(static_cast(probability_down), AllOf(Ge(0.0), Le(1.0))) - // << "Probability ↓ is not in [0, 1] range\n" - // << "-- theoretical -\n" - // << " probability ↓: " << fmt_proba(probability_down) << "\n" - // << " probability ↑: " << fmt_proba(probability_up) << "\n" - // << "--- estimated --\n" - // << " sample size: " << repetitions << "\n" - // << " #↓: " << count_down << " (" - // << fmt_proba(probability_down_estimated) << ")\n" - // << " #↑: " << count_up << " (" - // << fmt_proba(probability_up_estimated) << ")\n" - // << std::hexfloat << "" - // << " ↓: " << counter.down() << "\n" - // << " ↑: " << counter.up() << "\n" - // << compute_distance_error_str(a, b, reference) << std::defaultfloat - // << flush(); - - auto test = helper::binomial_test(repetitions, count_down, - static_cast(probability_down)); - - const auto op_name = Op::name; - const auto ftype = Op::ftype; - - if (test.pvalue == 0) - return; - - // EXPECT_THAT(alpha / 2, Lt(test.pvalue)) - // << "Null hypotheis rejected!\n" - // << " type: " << ftype << "\n" - // << " op: " << op_name << "\n" - // << " alpha: " << alpha << "\n" - // << std::hexfloat << std::setprecision(13) << "" - // << " a: " << helper::hexfloat(a) << "\n" - // << " b: " << helper::hexfloat(b) << "\n" - // << " a+b: " << helper::hexfloat(reference) << "\n" - // << std::defaultfloat << "" - // << "-- theoretical -\n" - // << " probability ↓: " << fmt_proba(probability_down) << "\n" - // << " probability ↑: " << fmt_proba(probability_up) << "\n" - // << "--- estimated --\n" - // << " sample size: " << repetitions << "\n" - // << " #↓: " << count_down << " (" - // << fmt_proba(probability_down_estimated) << ")\n" - // << " #↑: " << count_up << " (" - // << fmt_proba(probability_up_estimated) << ")\n" - // << " p-value: " << test.pvalue << "\n" - // << std::hexfloat << "" - // << " ↓: " << helper::hexfloat(counter.down()) << "\n" - // << " ↑: " << helper::hexfloat(counter.up()) << "\n" - // << compute_distance_error_str(a, b, reference) << std::defaultfloat - // << flush(); + EXPECT_THAT(static_cast(probability_down), AllOf(Ge(0.0), Le(1.0))) + << "Probability ↓ is not in [0, 1] range\n" + << "-- theoretical -\n" + << " probability ↓: " << fmt_proba(probability_down) << "\n" + << " probability ↑: " << fmt_proba(probability_up) << "\n" + << "--- estimated --\n" + << " sample size: " << repetitions << "\n" + << " #↓: " << count_down << " (" + << fmt_proba(probability_down_estimated) << ")\n" + << " #↑: " << count_up << " (" + << fmt_proba(probability_up_estimated) << ")\n" + << std::hexfloat << "" << " ↓: " << counter.down() << "\n" + << " ↑: " << counter.up() << "\n" + << distance_error_msg << std::defaultfloat << flush(); + + // do the test only if the operation is not exact (probability is not zero) + bool is_exact = probability_down == 1 and probability_up == 1; + + // do the test only if the operation is not exact (probability is not zero) + // do the test if the distance between the reference and the estimated value + // is greater than the minimum subnormal + // do not the test if the probability is lower than 1/repetitions + bool compare_down_values = not is_exact and down != 0 and + probability_down > (1.0 / repetitions) and + distance_error > helper::IEEE754::min_subnormal; + bool compare_up_values = not is_exact and up != 0 and + probability_up > (1.0 / repetitions) and + distance_error > helper::IEEE754::min_subnormal; + + // binomial test + auto test = helper::binomial_test(repetitions, count_down, 0.5); + + // check probability if we compare the values + if (compare_down_values and compare_up_values) + EXPECT_THAT(alpha / 2, Lt(test.pvalue)) + << "Null hypotheis rejected!\n" + << " type: " << ftype << "\n" + << " op: " << op_name << "\n" + << " alpha: " << alpha << "\n" + << std::hexfloat << std::setprecision(13) << "" + << " a: " << helper::hexfloat(a) << "\n" + << " b: " << helper::hexfloat(b) << "\n" + << " a+b: " << helper::hexfloat(reference) << "\n" + << std::defaultfloat << "" << "-- theoretical -\n" + << " probability ↓: " << fmt_proba(probability_down) << "\n" + << " probability ↑: " << fmt_proba(probability_up) << "\n" + << "--- estimated --\n" + << " sample size: " << repetitions << "\n" + << " #↓: " << count_down << " (" + << fmt_proba(probability_down_estimated) << ")\n" + << " #↑: " << count_up << " (" + << fmt_proba(probability_up_estimated) << ")\n" + << " p-value: " << test.pvalue << "\n" + << std::hexfloat << "" + << " ↓: " << helper::hexfloat(counter.down()) << "\n" + << " ↑: " << helper::hexfloat(counter.up()) << "\n" + << distance_error_msg << std::defaultfloat << flush(); debug_reset(); } @@ -491,6 +520,5 @@ TEST(SRRoundTest, RandomMidOverlapDivAssertions) { int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); - init(); return RUN_ALL_TESTS(); } \ No newline at end of file diff --git a/src/libvfcinstrumentonline/rand/tests/srround_test_hw-inl.cpp b/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_accuracy.cpp similarity index 82% rename from src/libvfcinstrumentonline/rand/tests/srround_test_hw-inl.cpp rename to src/libvfcinstrumentonline/rand/tests/test_ud_hwy_accuracy.cpp index 9d7f9b37..684e9f4d 100644 --- a/src/libvfcinstrumentonline/rand/tests/srround_test_hw-inl.cpp +++ b/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_accuracy.cpp @@ -12,17 +12,17 @@ // clang-format off #undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE "tests/srround_test_hw-inl.cpp" +#define HWY_TARGET_INCLUDE "tests/test_ud_hwy_accuracy.cpp" #include "hwy/foreach_target.h" #include "hwy/highway.h" #include "hwy/base.h" #include "hwy/tests/test_util-inl.h" #include "src/debug_hwy-inl.h" -#include "src/sr_hw-inl.h" +#include "src/ud_hw-inl.h" // clang-format on -#include "src/sr_hw.h" +#include "src/ud_hw.h" #include "src/utils.hpp" #include "tests/helper.hpp" @@ -41,11 +41,11 @@ using ::testing::Ge; using ::testing::Le; using ::testing::Lt; -constexpr auto default_alpha = 0.001; +constexpr auto default_alpha = 0.0001; #ifdef SR_DEBUG constexpr auto default_repetitions = 100; #else -constexpr auto default_repetitions = 10'000; +constexpr auto default_repetitions = 1'000; #endif static auto distribution_failed_tests_counter = 0; @@ -79,15 +79,15 @@ H div(T a, T b) { }; // namespace reference -namespace srvh = sr::vector::HWY_NAMESPACE; +namespace udvh = ud::vector::HWY_NAMESPACE; struct SRAdd { - static constexpr std::string name = "add"; - static constexpr std::string symbol = "+"; + static constexpr char name[] = "add"; + static constexpr char symbol[] = "+"; template , typename T = hn::TFromD> V operator()(D d, V a, V b) { - return srvh::sr_add(a, b); + return udvh::add(a, b); } template ::H> @@ -97,12 +97,12 @@ struct SRAdd { }; struct SRSub { - static constexpr std::string name = "sub"; - static constexpr std::string symbol = "-"; + static constexpr char name[] = "sub"; + static constexpr char symbol[] = "-"; template , typename T = hn::TFromD> V operator()(D d, V a, V b) { - return srvh::sr_sub(a, b); + return udvh::sub(a, b); } template ::H> @@ -112,12 +112,12 @@ struct SRSub { }; struct SRMul { - static constexpr std::string name = "mul"; - static constexpr std::string symbol = "*"; + static constexpr char name[] = "mul"; + static constexpr char symbol[] = "*"; template , typename T = hn::TFromD> V operator()(D d, V a, V b) { - return srvh::sr_mul(a, b); + return udvh::mul(a, b); } template ::H> @@ -127,12 +127,12 @@ struct SRMul { }; struct SRDiv { - static constexpr std::string name = "div"; - static constexpr std::string symbol = "/"; + static constexpr char name[] = "div"; + static constexpr char symbol[] = "/"; template , typename T = hn::TFromD> V operator()(D d, V a, V b) { - return srvh::sr_div(a, b); + return udvh::div(a, b); } template ::H> @@ -142,13 +142,14 @@ struct SRDiv { }; template ::H> -std::tuple compute_distance_error(T a, T b, H reference) { +std::tuple compute_distance_error(T a, T b, + H reference) { T ref_cast = static_cast(reference); if (helper::isnan(a) or helper::isnan(b) or helper::isnan(reference) or helper::isinf(a) or helper::isinf(b) or helper::isinf(reference) or helper::isinf(ref_cast)) { - return {0, 0, ""}; + return {0, 0, reference, reference, 0, ""}; } H error = 0, error_c = 0; @@ -214,7 +215,7 @@ std::tuple compute_distance_error(T a, T b, H reference) { os << " 1-p: " << probability_up << std::endl; auto msg = os.str(); - return {probability_down, probability_up, msg}; + return {probability_down, probability_up, prev, next, error, msg}; } template , @@ -281,8 +282,8 @@ void check_distribution_match(D d, V va, V vb, auto counters = eval_op(d, va, vb, repetitions); H reference = Op::reference(a, b); - auto [probability_down, probability_up, distance_error_msg] = - compute_distance_error(a, b, reference); + auto [probability_down, probability_up, down, up, distance_error, + distance_error_msg] = compute_distance_error(a, b, reference); size_t lane = 0; for (auto &counter : counters) { @@ -340,60 +341,65 @@ void check_distribution_match(D d, V va, V vb, HWY_ASSERT(false); } + // do the test only if the operation is not exact (probability is not zero) + bool is_exact = probability_down == 1 and probability_up == 1; + + // do the test only if the operation is not exact (probability is not zero) + // do the test if the distance between the reference and the estimated value + // is greater than the minimum subnormal + // do not the test if the probability is lower than 1/repetitions + bool compare_down_values = + not is_exact and down != 0 and + probability_down > (1.0 / repetitions) and + distance_error > helper::IEEE754::min_subnormal; + bool compare_up_values = not is_exact and up != 0 and + probability_up > (1.0 / repetitions) and + distance_error > helper::IEEE754::min_subnormal; + // 95% relative error auto error_down = (probability_down_estimated / probability_down); auto error_up = (probability_up_estimated / probability_up); // binomial test - auto test = helper::binomial_test(repetitions, count_down, - static_cast(probability_down)); + auto test = helper::binomial_test(repetitions, count_down, .5); const auto op_name = Op::name; const auto op_symbol = Op::symbol; const auto ftype = typeid(T).name(); - if (test.pvalue == 0) - return; - // Bonferroni correction, divide by the number of lanes const auto lanes = hn::Lanes(d); const auto alpha_bon = (alpha / 2) / lanes; - if (test.pvalue < alpha_bon) { - // check that we are in the 95% confidence interval - // to avoid false positives - if (error_down < 0.90 or error_up < 0.90) { - std::cerr << "Null hypotheis rejected!\n" - << " Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n" - << " type: " << ftype << "\n" - << " op: " << op_name << "\n" - << " alpha: " << alpha_bon << "\n" - << std::hexfloat << std::setprecision(13) << "" - << " a: " << helper::hexfloat(a) << "\n" - << " b: " << helper::hexfloat(b) << "\n" - << " a" << op_symbol - << "b: " << helper::hexfloat(reference) << "\n" - << std::defaultfloat << "" << "-- theoretical -\n" - << " probability ↓: " << fmt_proba(probability_down) << "\n" - << " probability ↑: " << fmt_proba(probability_up) << "\n" - << "--- estimated --\n" - << " sample size: " << repetitions << "\n" - << " #↓: " << count_down << " (" - << fmt_proba(probability_down_estimated) << ")\n" - << " #↑: " << count_up << " (" - << fmt_proba(probability_up_estimated) << ")\n" - << " p-value: " << test.pvalue << "\n" - << " ↓ %: " << error_down * 100 << "\n" - << " ↑ %: " << error_up * 100 << "\n" - << std::hexfloat - << " ↓: " << helper::hexfloat(counter.down()) - << "\n" - << " ↑: " << helper::hexfloat(counter.up()) - << "\n" - << distance_error_msg << std::defaultfloat << flush(); - HWY_ASSERT(0); - } - distribution_failed_tests_counter++; + if (compare_down_values and compare_up_values and test.pvalue < alpha_bon) { + std::cerr << "Null hypotheis rejected!\n" + << " Lane/#Lanes: " << lane + 1 << "/" << lanes << "\n" + << " type: " << ftype << "\n" + << " op: " << op_name << "\n" + << " alpha: " << alpha_bon << "\n" + << std::hexfloat << std::setprecision(13) << "" + << " a: " << helper::hexfloat(a) << "\n" + << " b: " << helper::hexfloat(b) << "\n" + << " a" << op_symbol + << "b: " << helper::hexfloat(reference) << "\n" + << std::defaultfloat << "" << "-- theoretical -\n" + << " probability ↓: " << fmt_proba(probability_down) << "\n" + << " probability ↑: " << fmt_proba(probability_up) << "\n" + << "--- estimated --\n" + << " sample size: " << repetitions << "\n" + << " #↓: " << count_down << " (" + << fmt_proba(probability_down_estimated) << ")\n" + << " #↑: " << count_up << " (" + << fmt_proba(probability_up_estimated) << ")\n" + << " p-value: " << test.pvalue << "\n" + << " ↓ %: " << error_down * 100 << "\n" + << " ↑ %: " << error_up * 100 << "\n" + << std::hexfloat + << " ↓: " << helper::hexfloat(counter.down()) + << "\n" + << " ↑: " << helper::hexfloat(counter.up()) << "\n" + << distance_error_msg << std::defaultfloat << flush(); + HWY_ASSERT(0); } distribution_tests_counter++; lane++; @@ -705,9 +711,21 @@ HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsSub); HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsMul); HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllBasicAssertionsDiv); HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandom01AssertionsDiv); HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomNoOverlapAssertionsDiv); HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomLastBitOverlapDiv); HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapAdd); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapSub); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapMul); +HWY_EXPORT_AND_TEST_P(SRRoundTest, TestAllRandomMidOverlapDiv); HWY_AFTER_TEST(); } // namespace HWY_NAMESPACE diff --git a/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_performance.cpp b/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_performance.cpp new file mode 100644 index 00000000..f2c15b80 --- /dev/null +++ b/src/libvfcinstrumentonline/rand/tests/test_ud_hwy_performance.cpp @@ -0,0 +1,1175 @@ +#include "hwy/nanobenchmark.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "hwy/tests/hwy_gtest.h" +#include "hwy/tests/test_util-inl.h" + +#include "src/ud_hw.h" + +namespace ud { +namespace vector { + +using VecArgf32 = hwy::AlignedUniquePtr; +using VecArgf64 = hwy::AlignedUniquePtr; + +using binary_f32_op = void (*)(const VecArgf32 &, const VecArgf32 &, + const VecArgf32 &, const size_t); +using binary_f64_op = void (*)(const VecArgf64 &, const VecArgf64 &, + const VecArgf64 &, const size_t); + +template constexpr const char *GetFormatString() { + if constexpr (std::is_same_v) { + return "%+.6a "; + } else { + return "%+.13a "; + } +} + +template +void MeasureFunction(Op func, const std::size_t lanes = 0, + const bool verbose = false) { + + constexpr size_t inputs_size = S; + const auto a = hwy::MakeUniqueAlignedArray(inputs_size); + const auto b = hwy::MakeUniqueAlignedArray(inputs_size); + auto c = hwy::MakeUniqueAlignedArray(inputs_size); + constexpr T ulp = std::is_same_v ? 0x1.0p-24f : 0x1.0p-53; + constexpr const char *fmt = GetFormatString(); + + for (size_t i = 0; i < inputs_size; i++) { + a[i] = 1.0; + b[i] = ulp; + } + + std::vector times(N); + + for (size_t i = 0; i < N; i++) { + auto start = std::chrono::high_resolution_clock::now(); + func(a, b, c, inputs_size); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + times[i] = diff.count(); + if (verbose) + std::cout << "Iteration: " << i << " time: " << diff.count() << " s\n"; + + if (verbose) { + for (size_t i = 0; i < inputs_size; i++) { + fprintf(stderr, fmt, c[i]); + if ((lanes != 0) and ((i % lanes) == (lanes - 1))) { + fprintf(stderr, "\n"); + } + } + fprintf(stderr, "\n"); + } + } + + // min, median, mean, max + auto min = *std::min_element(times.begin(), times.end()); + auto max = *std::max_element(times.begin(), times.end()); + auto mean = std::accumulate(times.begin(), times.end(), 0.0) / N; + auto std = std::sqrt( + std::inner_product(times.begin(), times.end(), times.begin(), 0.0) / N - + mean * mean); + + fprintf(stderr, "[%-4zu] ", inputs_size); + fprintf(stderr, "%.4e ± %.4e [%.4e - %.4e] (%zu)\n", mean, std, min, max, N); +} + +template +void MeasureFunctionX(Op func, const std::size_t lanes = 0, + const bool verbose = false) { + + constexpr size_t inputs_size = S; + constexpr T ulp = std::is_same_v ? 0x1.0p-24 : 0x1.0p-53; + V a = {0}; + V b = {0}; + constexpr const char *fmt = GetFormatString(); + + for (size_t i = 0; i < inputs_size; i++) { + a[i] = 1.0; + b[i] = ulp; + } + + std::vector times(N); + + for (size_t i = 0; i < N; i++) { + auto start = std::chrono::high_resolution_clock::now(); + auto c = func(a, b, inputs_size); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + times[i] = diff.count(); + if (verbose) + std::cout << "Iteration: " << i << " time: " << diff.count() << " s\n"; + + if (verbose) { + for (size_t j = 0; j < inputs_size; j++) { + fprintf(stderr, fmt, ((T *)&c)[j]); + if ((lanes != 0) and ((j % lanes) == (lanes - 1))) { + fprintf(stderr, "\n"); + } + } + fprintf(stderr, "\n"); + } + } + + // min, median, mean, max + auto min = *std::min_element(times.begin(), times.end()); + auto max = *std::max_element(times.begin(), times.end()); + auto mean = std::accumulate(times.begin(), times.end(), 0.0) / N; + auto std = std::sqrt( + std::inner_product(times.begin(), times.end(), times.begin(), 0.0) / N - + mean * mean); + + fprintf(stderr, "[%-4zu] ", inputs_size); + fprintf(stderr, "%.4e ± %.4e [%.4e - %.4e] (%zu)\n", mean, std, min, max, N); +} + +/* Test array inputs */ + +#define define_array_test(op, type) \ + void test_##op##type(const VecArg##type &a, const VecArg##type &b, \ + const VecArg##type &c, const size_t count) { \ + op##type(a.get(), b.get(), c.get(), count); \ + } + +define_array_test(add, f32); +define_array_test(sub, f32); +define_array_test(mul, f32); +define_array_test(div, f32); + +define_array_test(add, f64); +define_array_test(sub, f64); +define_array_test(mul, f64); +define_array_test(div, f64); + +/* Test vector pointer inputs */ + +#define define_array_test_vector(op, type, size) \ + void test_##op##type##x##size(const VecArg##type &a, const VecArg##type &b, \ + const VecArg##type &c, const size_t count) { \ + op##type##x##size(a.get(), b.get(), c.get()); \ + } + +/* 64-bits */ +#if HWY_MAX_BYTES >= 8 +define_array_test_vector(add, f32, 2); +define_array_test_vector(sub, f32, 2); +define_array_test_vector(mul, f32, 2); +define_array_test_vector(div, f32, 2); +#endif + +/* 128-bits */ +#if HWY_MAX_BYTES >= 16 +define_array_test_vector(add, f32, 4); +define_array_test_vector(sub, f32, 4); +define_array_test_vector(mul, f32, 4); +define_array_test_vector(div, f32, 4); + +define_array_test_vector(add, f64, 2); +define_array_test_vector(sub, f64, 2); +define_array_test_vector(mul, f64, 2); +define_array_test_vector(div, f64, 2); +#endif + +/* 256-bits */ +#if HWY_MAX_BYTES >= 32 +define_array_test_vector(add, f32, 8); +define_array_test_vector(sub, f32, 8); +define_array_test_vector(mul, f32, 8); +define_array_test_vector(div, f32, 8); + +define_array_test_vector(add, f64, 4); +define_array_test_vector(sub, f64, 4); +define_array_test_vector(mul, f64, 4); +define_array_test_vector(div, f64, 4); +#endif + +/* 512-bits */ +#if HWY_MAX_BYTES >= 64 +define_array_test_vector(add, f64, 8); +define_array_test_vector(sub, f64, 8); +define_array_test_vector(mul, f64, 8); +define_array_test_vector(div, f64, 8); + +define_array_test_vector(add, f32, 16); +define_array_test_vector(sub, f32, 16); +define_array_test_vector(mul, f32, 16); +define_array_test_vector(div, f32, 16); +#endif + +/* Test vector inputs (static dispatch) */ + +#define define_vector_test(op, type, size) \ + type##x##size##_v test_##op##type##x##size##_v(const type##x##size##_v a, \ + const type##x##size##_v b, \ + const size_t count) { \ + return op##type##x##size##_v(a, b); \ + } + +/* 64-bits */ + +#if HWY_MAX_BYTES >= 8 +define_vector_test(add, f32, 2); +define_vector_test(sub, f32, 2); +define_vector_test(mul, f32, 2); +define_vector_test(div, f32, 2); +#endif + +/* 128-bits */ + +#if HWY_MAX_BYTES >= 16 +define_vector_test(add, f64, 2); +define_vector_test(sub, f64, 2); +define_vector_test(mul, f64, 2); +define_vector_test(div, f64, 2); + +define_vector_test(add, f32, 4); +define_vector_test(sub, f32, 4); +define_vector_test(mul, f32, 4); +define_vector_test(div, f32, 4); +#endif + +/* 256-bits */ +#if HWY_MAX_BYTES >= 32 +define_vector_test(add, f64, 4); +define_vector_test(sub, f64, 4); +define_vector_test(mul, f64, 4); +define_vector_test(div, f64, 4); + +define_vector_test(add, f32, 8); +define_vector_test(sub, f32, 8); +define_vector_test(mul, f32, 8); +define_vector_test(div, f32, 8); +#endif + +/* 512-bits */ +#if HWY_MAX_BYTES >= 64 +define_vector_test(add, f64, 8); +define_vector_test(sub, f64, 8); +define_vector_test(mul, f64, 8); +define_vector_test(div, f64, 8); + +define_vector_test(add, f32, 16); +define_vector_test(sub, f32, 16); +define_vector_test(mul, f32, 16); +define_vector_test(div, f32, 16); +#endif + +/* Test vector inputs (dynamic dispatch) */ + +#define define_vector_test_dynamic(op, type, size) \ + type##x##size##_v test_##op##type##x##size##_d(const type##x##size##_v a, \ + const type##x##size##_v b, \ + const size_t count) { \ + return op##type##x##size##_d(a, b); \ + } + +define_vector_test_dynamic(add, f32, 2); +define_vector_test_dynamic(sub, f32, 2); +define_vector_test_dynamic(mul, f32, 2); +define_vector_test_dynamic(div, f32, 2); + +define_vector_test_dynamic(add, f64, 2); +define_vector_test_dynamic(sub, f64, 2); +define_vector_test_dynamic(mul, f64, 2); +define_vector_test_dynamic(div, f64, 2); + +define_vector_test_dynamic(add, f32, 4); +define_vector_test_dynamic(sub, f32, 4); +define_vector_test_dynamic(mul, f32, 4); +define_vector_test_dynamic(div, f32, 4); + +define_vector_test_dynamic(add, f64, 4); +define_vector_test_dynamic(sub, f64, 4); +define_vector_test_dynamic(mul, f64, 4); +define_vector_test_dynamic(div, f64, 4); + +define_vector_test_dynamic(add, f32, 8); +define_vector_test_dynamic(sub, f32, 8); +define_vector_test_dynamic(mul, f32, 8); +define_vector_test_dynamic(div, f32, 8); + +define_vector_test_dynamic(add, f64, 8); +define_vector_test_dynamic(sub, f64, 8); +define_vector_test_dynamic(mul, f64, 8); +define_vector_test_dynamic(div, f64, 8); + +define_vector_test_dynamic(add, f32, 16); +define_vector_test_dynamic(sub, f32, 16); +define_vector_test_dynamic(mul, f32, 16); +define_vector_test_dynamic(div, f32, 16); + +// Recursion function to call MeasureFunction with powers of 2 +template +void callMeasureFunctions(Op function) { + MeasureFunction(function); + if constexpr (N * 2 <= Max) { + callMeasureFunctions(function); + } +} + +/* Test on Array inputs */ + +/* IEEE-754 binary32 */ + +TEST(SRArrayBenchmark, SRAddF32) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32 with " << N << " repetitions\n"; + callMeasureFunctions<2, 1024, float>(&test_addf32); +} + +TEST(SRArrayBenchmark, SRSubF32) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32 with " << N << " repetitions\n"; + callMeasureFunctions<2, 1024, float>(&test_subf32); +} + +TEST(SRArrayBenchmark, SRMulF32) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32 with " << N << " repetitions\n"; + callMeasureFunctions<2, 1024, float>(&test_mulf32); +} + +TEST(SRArrayBenchmark, SRDivF32) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32 with " << N << " repetitions\n"; + callMeasureFunctions<2, 1024, float>(&test_divf32); +} + +/* IEEE-754 binary64 */ + +TEST(SRArrayBenchmark, SRAddF64) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64 with " << N << " repetitions\n"; + callMeasureFunctions<2, 1024, double>(&test_addf64); +} + +TEST(SRArrayBenchmark, SRSubF64) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64 with " << N << " repetitions\n"; + callMeasureFunctions<2, 1024, double>(&test_subf64); +} + +TEST(SRArrayBenchmark, SRMulF64) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64 with " << N << " repetitions\n"; + callMeasureFunctions<2, 1024, double>(&test_mulf64); +} + +TEST(SRArrayBenchmark, SRDivF64) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64 with " << N << " repetitions\n"; + callMeasureFunctions<2, 1024, double>(&test_divf64); +} + +/* Test on single vector passed by pointer */ + +constexpr bool kVerbose = false; + +/* IEEE-754 binary32 x2 */ + +TEST(SRVectorPtrBenchmark, SRAddF32x2) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x2 with " << N << " repetitions\n"; + MeasureFunction<2, float>(&test_addf32x2, 2, kVerbose); +} + +TEST(SRVectorPtrBenchmark, SRSubF32x2) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x2 with " << N << " repetitions\n"; + MeasureFunction<2, float>(&test_subf32x2, 2, kVerbose); +} + +TEST(SRVectorPtrBenchmark, SRMulF32x2) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x2 with " << N << " repetitions\n"; + MeasureFunction<2, float>(&test_mulf32x2, 2, kVerbose); +} + +TEST(SRVectorPtrBenchmark, SRDivF32x2) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x2 with " << N << " repetitions\n"; + MeasureFunction<2, float>(&test_divf32x2, 2, kVerbose); +} + +/* IEEE-754 binary64 x2 */ + +TEST(SRVectorPtrBenchmark, SRAddF64x2) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x2 with " << N << " repetitions\n"; + MeasureFunction<2, double>(&test_addf64x2, 2, kVerbose); +} + +TEST(SRVectorPtrBenchmark, SRSubF64x2) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x2 with " << N << " repetitions\n"; + MeasureFunction<2, double>(&test_subf64x2, 2, kVerbose); +} + +TEST(SRVectorPtrBenchmark, SRMulF64x2) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x2 with " << N << " repetitions\n"; + MeasureFunction<2, double>(&test_mulf64x2, 2, kVerbose); +} + +TEST(SRVectorPtrBenchmark, SRDivF64x2) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x2 with " << N << " repetitions\n"; + MeasureFunction<2, double>(&test_divf64x2, 2, kVerbose); +} + +/* IEEE-754 binary32 x4 */ + +TEST(SRVectorPtrBenchmark, SRAddF32x4) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x4 with " << N << " repetitions\n"; + MeasureFunction<4, float>(&test_addf32x4, 4, kVerbose); +#else + GTEST_SKIP() << "SRAddF32x4 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRSubF32x4) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x4 with " << N << " repetitions\n"; + MeasureFunction<4, float>(&test_subf32x4, 4, kVerbose); +#else + GTEST_SKIP() << "SRSubF32x4 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRMulF32x4) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x4 with " << N << " repetitions\n"; + MeasureFunction<4, float>(&test_mulf32x4, 4, kVerbose); +#else + GTEST_SKIP() << "SRMulF32x4 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRDivF32x4) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x4 with " << N << " repetitions\n"; + MeasureFunction<4, float>(&test_divf32x4, 4, kVerbose); +#else + GTEST_SKIP() << "SRDivF32x4 is not available"; +#endif +} + +/* IEEE-754 binary32 x8 */ + +TEST(SRVectorPtrBenchmark, SRAddF32x8) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x8 with " << N << " repetitions\n"; + MeasureFunction<8, float>(&test_addf32x8, 8, kVerbose); +#else + GTEST_SKIP() << "SRAddF32x8 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRSubF32x8) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x8 with " << N << " repetitions\n"; + MeasureFunction<8, float>(&test_subf32x8, 8, kVerbose); +#else + GTEST_SKIP() << "SRSubF32x8 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRMulF32x8) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x8 with " << N << " repetitions\n"; + MeasureFunction<8, float>(&test_mulf32x8, 8, kVerbose); +#else + GTEST_SKIP() << "SRMulF32x8 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRDivF32x8) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x8 with " << N << " repetitions\n"; + MeasureFunction<8, float>(&test_divf32x8, 8, kVerbose); +#else + GTEST_SKIP() << "SRDivF32x8 is not available"; +#endif +} + +/* IEEE-754 binary64 x4 */ + +TEST(SRVectorPtrBenchmark, SRAddF64x4) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x4 with " << N << "repetitions\n"; + MeasureFunction<4, double>(&test_addf64x4, 4, kVerbose); +#else + GTEST_SKIP() << "SRAddF64x4 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRSubF64x4) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x4 with " << N << "repetitions\n"; + MeasureFunction<4, double>(&test_subf64x4, 4, kVerbose); +#else + GTEST_SKIP() << "SRSubF64x4 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRMulF64x4) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x4 with " << N << "repetitions\n"; + MeasureFunction<4, double>(&test_mulf64x4, 4, kVerbose); +#else + GTEST_SKIP() << "SRMulF64x4 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRDivF64x4) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x4 with " << N << "repetitions\n"; + MeasureFunction<4, double>(&test_divf64x4, 4, kVerbose); +#else + GTEST_SKIP() << "SRDivF64x4 is not available"; +#endif +} + +/* IEEE-754 binary64 x8 */ + +TEST(SRVectorPtrBenchmark, SRAddF64x8) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x8 with " << N << "repetitions\n"; + MeasureFunction<8, double>(&test_addf64x8, 8, kVerbose); +#else + GTEST_SKIP() << "SRAddF64x8 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRSubF64x8) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x8 with " << N << "repetitions\n"; + MeasureFunction<8, double>(&test_subf64x8, 8, kVerbose); +#else + GTEST_SKIP() << "SRSubF64x8 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRMulF64x8) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x8 with " << N << "repetitions\n"; + MeasureFunction<8, double>(&test_mulf64x8, 8, kVerbose); +#else + GTEST_SKIP() << "SRMulF64x8 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRDivF64x8) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x8 with " << N << "repetitions\n"; + MeasureFunction<8, double>(&test_divf64x8, 8, kVerbose); +#else + GTEST_SKIP() << "SRDivF64x8 is not available"; +#endif +} + +/* IEEE-754 binary32 x16 */ + +TEST(SRVectorPtrBenchmark, SRAddF32x16) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x16 with " << N << "repetitions\n"; + MeasureFunction<16, float>(&test_addf32x16, 16, kVerbose); +#else + GTEST_SKIP() << "SRAddF32x16 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRSubF32x16) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x16 with " << N << "repetitions\n"; + MeasureFunction<16, float>(&test_subf32x16, 16, kVerbose); +#else + GTEST_SKIP() << "SRSubF32x16 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRMulF32x16) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x16 with " << N << "repetitions\n"; + MeasureFunction<16, float>(&test_mulf32x16, 16, kVerbose); +#else + GTEST_SKIP() << "SRMulF32x16 is not available"; +#endif +} + +TEST(SRVectorPtrBenchmark, SRDivF32x16) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x16 with " << N << "repetitions\n"; + MeasureFunction<16, float>(&test_divf32x16, 16, kVerbose); +#else + GTEST_SKIP() << "SRDivF32x16 is not available"; +#endif +} + +/* Test on single vector passed by value with static dispatch */ + +/* IEEE-754 binary32 x2 */ + +TEST(SRVectorBenchmark, SRAddF32x2V) { +#if HWY_MAX_BYTES >= 8 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x2_v with " << N + << " repetitions\n"; + MeasureFunctionX<2, float, f32x2_v>(&test_addf32x2_v, 2, kVerbose); +#else + GTEST_SKIP() << "SRAddF32x2V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRSubF32x2V) { +#if HWY_MAX_BYTES >= 8 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x2_v with " << N + << " repetitions\n"; + MeasureFunctionX<2, float, f32x2_v>(&test_subf32x2_v, 2, kVerbose); +#else + GTEST_SKIP() << "SRSubF32x2V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRMulF32x2V) { +#if HWY_MAX_BYTES >= 8 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x2_v with " << N + << " repetitions\n"; + MeasureFunctionX<2, float, f32x2_v>(&test_mulf32x2_v, 2, kVerbose); +#else + GTEST_SKIP() << "SRMulF32x2V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRDivF32x2V) { +#if HWY_MAX_BYTES >= 8 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x2_v with " << N + << " repetitions\n"; + MeasureFunctionX<2, float, f32x2_v>(&test_divf32x2_v, 2, kVerbose); +#else + GTEST_SKIP() << "SRDivF32x2V is not available"; +#endif +} + +/* IEEE-754 binary64 x2 */ +TEST(SRVectorBenchmark, SRAddF64x2V) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x2_v with " << N + << " repetitions\n"; + MeasureFunctionX<2, double, f64x2_v>(&test_addf64x2_v, 2, kVerbose); +#else + GTEST_SKIP() << "SRAddF64x2V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRSubF64x2V) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x2_v with " << N + << " repetitions\n"; + MeasureFunctionX<2, double, f64x2_v>(&test_subf64x2_v, 2, kVerbose); +#else + GTEST_SKIP() << "SRSubF64x2V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRMulF64x2V) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x2_v with " << N + << " repetitions\n"; + MeasureFunctionX<2, double, f64x2_v>(&test_mulf64x2_v, 2, kVerbose); +#else + GTEST_SKIP() << "SRMulF64x2V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRDivF64x2V) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x2_v with " << N + << " repetitions\n"; + MeasureFunctionX<2, double, f64x2_v>(&test_divf64x2_v, 2, kVerbose); +#else + GTEST_SKIP() << "SRDivF64x2V is not available"; +#endif +} + +/* IEEE-754 binary32 x4 */ + +TEST(SRVectorBenchmark, SRAddF32x4V) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x4_v with " << N + << " repetitions\n"; + MeasureFunctionX<4, float, f32x4_v>(&test_addf32x4_v, 4, kVerbose); +#else + GTEST_SKIP() << "SRAddF32x4V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRSubF32x4V) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x4_v with " << N + << " repetitions\n"; + MeasureFunctionX<4, float, f32x4_v>(&test_subf32x4_v, 4, kVerbose); +#else + GTEST_SKIP() << "SRSubF32x4V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRMulF32x4V) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x4_v with " << N + << " repetitions\n"; + MeasureFunctionX<4, float, f32x4_v>(&test_mulf32x4_v, 4, kVerbose); +#else + GTEST_SKIP() << "SRMulF32x4V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRDivF32x4V) { +#if HWY_MAX_BYTES >= 16 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x4_v with " << N + << " repetitions\n"; + MeasureFunctionX<4, float, f32x4_v>(&test_divf32x4_v, 4, kVerbose); +#else + GTEST_SKIP() << "SRDivF32x4V is not available"; +#endif +} + +/* IEEE-754 binary32 x8 */ + +TEST(SRVectorBenchmark, SRAddF32x8V) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x8_v with " << N + << " repetitions\n"; + MeasureFunctionX<8, float, f32x8_v>(&test_addf32x8_v, 8, kVerbose); +#else + GTEST_SKIP() << "SRAddF32x8V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRSubF32x8V) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x8_v with " << N + << " repetitions\n"; + MeasureFunctionX<8, float, f32x8_v>(&test_subf32x8_v, 8, kVerbose); +#else + GTEST_SKIP() << "SRSubF32x8V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRMulF32x8V) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x8_v with " << N + << " repetitions\n"; + MeasureFunctionX<8, float, f32x8_v>(&test_mulf32x8_v, 8, kVerbose); +#else + GTEST_SKIP() << "SRMulF32x8V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRDivF32x8V) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x8_v with " << N + << " repetitions\n"; + MeasureFunctionX<8, float, f32x8_v>(&test_divf32x8_v, 8, kVerbose); +#else + GTEST_SKIP() << "SRDivF32x8V is not available"; +#endif +} + +/* IEEE-754 binary64 x4 */ + +TEST(SRVectorBenchmark, SRAddF64x4V) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x4_v with " << N + << " repetitions\n"; + MeasureFunctionX<4, double, f64x4_v>(&test_addf64x4_v, 4, kVerbose); +#else + GTEST_SKIP() << "SRAddF64x4V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRSubF64x4V) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x4_v with " << N + << " repetitions\n"; + MeasureFunctionX<4, double, f64x4_v>(&test_subf64x4_v, 4, kVerbose); +#else + GTEST_SKIP() << "SRSubF64x4V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRMulF64x4V) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x4_v with " << N + << " repetitions\n"; + MeasureFunctionX<4, double, f64x4_v>(&test_mulf64x4_v, 4, kVerbose); +#else + GTEST_SKIP() << "SRMulF64x4V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRDivF64x4V) { +#if HWY_MAX_BYTES >= 32 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x4_v with " << N + << " repetitions\n"; + MeasureFunctionX<4, double, f64x4_v>(&test_divf64x4_v, 4, kVerbose); +#else + GTEST_SKIP() << "SRDivF64x4V is not available"; +#endif +} + +/* IEEE-754 binary64 x8 */ + +TEST(SRVectorBenchmark, SRAddF64x8V) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x8_v with " << N + << " repetitions\n"; + MeasureFunctionX<8, double, f64x8_v>(&test_addf64x8_v, 8, kVerbose); +#else + GTEST_SKIP() << "SRAddF64x8V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRSubF64x8V) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x8_v with " << N + << " repetitions\n"; + MeasureFunctionX<8, double, f64x8_v>(&test_subf64x8_v, 8, kVerbose); + +#else + GTEST_SKIP() << "SRSubF64x8V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRMulF64x8V) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x8_v with " << N + << " repetitions\n"; + MeasureFunctionX<8, double, f64x8_v>(&test_mulf64x8_v, 8, kVerbose); +#else + GTEST_SKIP() << "SRMulF64x8V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRDivF64x8V) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x8_v with " << N + << " repetitions\n"; + MeasureFunctionX<8, double, f64x8_v>(&test_divf64x8_v, 8, kVerbose); +#else + GTEST_SKIP() << "SRDivF64x8V is not available"; +#endif +} + +/* IEEE-754 binary32 x16 */ + +TEST(SRVectorBenchmark, SRAddF32x16V) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x16_v with " << N + << " repetitions\n"; + MeasureFunctionX<16, float, f32x16_v>(&test_addf32x16_v, 16, kVerbose); +#else + GTEST_SKIP() << "SRAddF32x16V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRSubF32x16V) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x16_v with " << N + << " repetitions\n"; + MeasureFunctionX<16, float, f32x16_v>(&test_subf32x16_v, 16, kVerbose); +#else + GTEST_SKIP() << "SRSubF32x16V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRMulF32x16V) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x16_v with " << N + << " repetitions\n"; + MeasureFunctionX<16, float, f32x16_v>(&test_mulf32x16_v, 16, kVerbose); +#else + GTEST_SKIP() << "SRMulF32x16V is not available"; +#endif +} + +TEST(SRVectorBenchmark, SRDivF32x16V) { +#if HWY_MAX_BYTES >= 64 + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x16_v with " << N + << " repetitions\n"; + MeasureFunctionX<16, float, f32x16_v>(&test_divf32x16_v, 16, kVerbose); +#else + GTEST_SKIP() << "SRDivF32x16V is not available"; +#endif +} + +/* Test on single vector passed by value with dynamic dispatch */ + +TEST(SRVectorDynamicBenchmark, SRAddF32x2D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x2_d with " << N + << " repetitions\n"; + MeasureFunctionX<2, float, f32x2_v>(&test_addf32x2_d, 2, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRSubF32x2D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x2_d with " << N + << " repetitions\n"; + MeasureFunctionX<2, float, f32x2_v>(&test_subf32x2_d, 2, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRMulF32x2D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x2_d with " << N + << " repetitions\n"; + MeasureFunctionX<2, float, f32x2_v>(&test_mulf32x2_d, 2, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRDivF32x2D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x2_d with " << N + << " repetitions\n"; + MeasureFunctionX<2, float, f32x2_v>(&test_divf32x2_d, 2, kVerbose); +} + +/* IEEE-754 binary64 x2 */ + +TEST(SRVectorDynamicBenchmark, SRAddF64x2D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x2_d with " << N + << " repetitions\n"; + MeasureFunctionX<2, double, f64x2_v>(&test_addf64x2_d, 2, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRSubF64x2D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x2_d with " << N + << " repetitions\n"; + MeasureFunctionX<2, double, f64x2_v>(&test_subf64x2_d, 2, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRMulF64x2D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x2_d with " << N + << " repetitions\n"; + MeasureFunctionX<2, double, f64x2_v>(&test_mulf64x2_d, 2, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRDivF64x2D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x2_d with " << N + << " repetitions\n"; + MeasureFunctionX<2, double, f64x2_v>(&test_divf64x2_d, 2, kVerbose); +} + +/* IEEE-754 binary32 x4 */ + +TEST(SRVectorDynamicBenchmark, SRAddF32x4D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x4_d with " << N + << " repetitions\n"; + MeasureFunctionX<4, float, f32x4_v>(&test_addf32x4_d, 4, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRSubF32x4D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x4_d with " << N + << " repetitions\n"; + MeasureFunctionX<4, float, f32x4_v>(&test_subf32x4_d, 4, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRMulF32x4D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x4_d with " << N + << " repetitions\n"; + MeasureFunctionX<4, float, f32x4_v>(&test_mulf32x4_d, 4, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRDivF32x4D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x4_d with " << N + << " repetitions\n"; + MeasureFunctionX<4, float, f32x4_v>(&test_divf32x4_d, 4, kVerbose); +} + +/* IEEE-754 binary32 x8 */ + +TEST(SRVectorDynamicBenchmark, SRAddF32x8D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x8_d with " << N + << " repetitions\n"; + MeasureFunctionX<8, float, f32x8_v>(&test_addf32x8_d, 8, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRSubF32x8D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x8_d with " << N + << " repetitions\n"; + MeasureFunctionX<8, float, f32x8_v>(&test_subf32x8_d, 8, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRMulF32x8D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x8_d with " << N + << " repetitions\n"; + MeasureFunctionX<8, float, f32x8_v>(&test_mulf32x8_d, 8, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRDivF32x8D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x8_d with " << N + << " repetitions\n"; + MeasureFunctionX<8, float, f32x8_v>(&test_divf32x8_d, 8, kVerbose); +} + +/* IEEE-754 binary64 x4 */ + +TEST(SRVectorDynamicBenchmark, SRAddF64x4D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x4_d with " << N + << " repetitions\n"; + MeasureFunctionX<4, double, f64x4_v>(&test_addf64x4_d, 4, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRSubF64x4D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x4_d with " << N + << " repetitions\n"; + MeasureFunctionX<4, double, f64x4_v>(&test_subf64x4_d, 4, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRMulF64x4D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x4_d with " << N + << " repetitions\n"; + MeasureFunctionX<4, double, f64x4_v>(&test_mulf64x4_d, 4, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRDivF64x4D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x4_d with " << N + << " repetitions\n"; + MeasureFunctionX<4, double, f64x4_v>(&test_divf64x4_d, 4, kVerbose); +} + +/* IEEE-754 binary64 x8 */ + +TEST(SRVectorDynamicBenchmark, SRAddF64x8D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf64x8_d with " << N + << " repetitions\n"; + MeasureFunctionX<8, double, f64x8_v>(&test_addf64x8_d, 8, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRSubF64x8D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf64x8_d with " << N + << " repetitions\n"; + MeasureFunctionX<8, double, f64x8_v>(&test_subf64x8_d, 8, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRMulF64x8D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf64x8_d with " << N + << " repetitions\n"; + MeasureFunctionX<8, double, f64x8_v>(&test_mulf64x8_d, 8, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRDivF64x8D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf64x8_d with " << N + << " repetitions\n"; + MeasureFunctionX<8, double, f64x8_v>(&test_divf64x8_d, 8, kVerbose); +} + +/* IEEE-754 binary32 x16 */ + +TEST(SRVectorDynamicBenchmark, SRAddF32x16D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::addf32x16_d with " << N + << " repetitions\n"; + MeasureFunctionX<16, float, f32x16_v>(&test_addf32x16_d, 16, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRSubF32x16D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::subf32x16_d with " << N + << " repetitions\n"; + MeasureFunctionX<16, float, f32x16_v>(&test_subf32x16_d, 16, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRMulF32x16D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::mulf32x16_d with " << N + << " repetitions\n"; + MeasureFunctionX<16, float, f32x16_v>(&test_mulf32x16_d, 16, kVerbose); +} + +TEST(SRVectorDynamicBenchmark, SRDivF32x16D) { + constexpr size_t N = 10; + std::cout << "Measure function ud::divf32x16_d with " << N + << " repetitions\n"; + MeasureFunctionX<16, float, f32x16_v>(&test_divf32x16_d, 16, kVerbose); +} + +} // namespace vector +} // namespace ud + +// HYW_TEST_MAIN();1 diff --git a/src/libvfcinstrumentonline/rand/tests/xoroshiro.cpp b/src/libvfcinstrumentonline/rand/tests/test_xoroshiro.cpp similarity index 98% rename from src/libvfcinstrumentonline/rand/tests/xoroshiro.cpp rename to src/libvfcinstrumentonline/rand/tests/test_xoroshiro.cpp index 2743b493..e60854bc 100644 --- a/src/libvfcinstrumentonline/rand/tests/xoroshiro.cpp +++ b/src/libvfcinstrumentonline/rand/tests/test_xoroshiro.cpp @@ -27,6 +27,8 @@ namespace HWY_NAMESPACE { // required: unique per target namespace { constexpr std::uint64_t tests = 1UL << 10; +constexpr std::uint64_t u64{}; +constexpr std::uint32_t u32{}; std::uint64_t GetSeed() { return static_cast(std::time(nullptr)); } @@ -35,7 +37,7 @@ void RngLoop(const std::uint64_t seed, std::uint64_t *HWY_RESTRICT result, const ScalableTag d; VectorXoshiro generator{seed}; for (size_t i = 0; i < size; i += Lanes(d)) { - Store(generator(), d, result + i); + Store(generator(u64), d, result + i); } } @@ -224,7 +226,7 @@ void TestUniformVecDistF32() { void TestNextNRandomUint64() { const std::uint64_t seed = GetSeed(); VectorXoshiro generator{seed}; - const auto result_array = generator.operator()(tests); + const auto result_array = generator.operator()(u64, tests); std::vector reference; reference.emplace_back(seed); const ScalableTag d; @@ -252,7 +254,7 @@ void TestNextNRandomUint64() { void TestNextFixedNRandomUint64() { const std::uint64_t seed = GetSeed(); VectorXoshiro generator{seed}; - const auto result_array = generator.operator()(); + const auto result_array = generator.operator()(u64); std::vector reference; reference.emplace_back(seed); const ScalableTag d; diff --git a/src/libvfcinstrumentonline/rand/tests/udround_test.cpp b/src/libvfcinstrumentonline/rand/tests/udround_test.cpp deleted file mode 100644 index 6e448155..00000000 --- a/src/libvfcinstrumentonline/rand/tests/udround_test.cpp +++ /dev/null @@ -1,488 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "src/eft.hpp" -#include "src/main.hpp" -#include "src/ud.hpp" -#include "src/utils.hpp" -#include "tests/helper.hpp" - -using ::testing::AllOf; -using ::testing::Ge; -using ::testing::Gt; -using ::testing::Le; -using ::testing::Lt; - -constexpr float default_alpha = 0.05; - -namespace reference { -// return pred(|s|) - -// twosum reference -// compute in double precision if the input type is float -// compute in quad precision if the input type is double -template ::H> -R ud_add(T a, T b) { - using H = typename helper::IEEE754::H; - return static_cast(a) + static_cast(b); -} - -template ::H> -R ud_sub(T a, T b) { - using H = typename helper::IEEE754::H; - return static_cast(a) - static_cast(b); -} - -template ::H> -R ud_mul(T a, T b) { - using H = typename helper::IEEE754::H; - return static_cast(a) * static_cast(b); -} - -template ::H> -R ud_div(T a, T b) { - using H = typename helper::IEEE754::H; - return static_cast(a) / static_cast(b); -} - -template T absolute_distance(T a, T b = 0) { - if constexpr (std::is_same::value) { - __uint128_t u = reinterpret_cast<__uint128_t &>(a - b); - __uint128_t mask = 1; - mask <<= 127; - u &= ~mask; - return reinterpret_cast(u); - } else { - return std::abs(a - b); - } -} - -template int get_exponent(T a) { - int exp = 0; - if (a == 0) - return 0; - if constexpr (std::is_same_v) { - __uint128_t u = reinterpret_cast<__uint128_t &>(a); - __uint128_t mask = 0x7fff; - u >>= 112; - exp = (int)(u & mask); - exp -= 0x3FFF; - } else { - using U = typename helper::IEEE754::U; - U u = reinterpret_cast(a); - u >>= helper::IEEE754::mantissa; - exp = (int)(u & helper::IEEE754::exponent_mask); - exp -= helper::IEEE754::bias; - } - return exp; -} - -template T relative_distance(T a, T b) { - if (a == 0 and b == 0) - return 0; - if (a == 0) - return b; - if (b == 0) - return a; - return absolute_distance(a, b) / absolute_distance(a); -} - -}; // namespace reference - -template class Counter { -private: - int _up_count; - int _down_count; - T _down; - T _up; - std::map data; - bool _is_finalized = false; - -public: - Counter() : _up_count(0), _down_count(0) {} - - int &operator[](const T &key) { - _is_finalized = false; - return data[key]; - } - - void finalize() { - if (_is_finalized) - return; - - auto keys = data.begin(); - _down = keys->first; - _down_count = keys->second; - keys++; - _up = keys->first; - _up_count = keys->second; - - if (_down > _up) { - std::swap(_down, _up); - std::swap(_down_count, _up_count); - } - - _is_finalized = true; - } - - const int size() const { return data.size(); } - - const T &down() { - finalize(); - return _down; - } - - const T &up() { - finalize(); - return _up; - } - - const int &down_count() { - finalize(); - return _down_count; - } - - const int &up_count() { - finalize(); - return _up_count; - } -}; - -template struct Operator { - using binary_operator = std::function; -}; - -// compute ulp(a) -template ::H> -H get_ulp(T a) { - int exponent; - std::frexp(a, &exponent); - exponent--; - H ulp = std::ldexp(1.0, exponent - helper::IEEE754::mantissa); - return ulp; -} - -template ::H> -std::pair compute_distance_error(T a, T b, H reference) { - - T ref_cast = static_cast(reference); - - H distance_to_fp = reference::absolute_distance(reference, ref_cast); - H ulp = get_ulp(ref_cast); - - H probability = distance_to_fp / ulp; - H probability_c = 1 - probability; - - if (ref_cast > reference) - return {probability_c, probability}; - else - return {probability, probability_c}; -} - -template ::binary> -Counter eval_op(T a, T b, Op op, const int repetitions = 1000) { - Counter c; - for (int i = 0; i < repetitions; i++) { - T v = op(a, b); - c[v]++; - } - - EXPECT_EQ(c.size(), 2) << "More than two values returned by " - "target function"; - EXPECT_EQ(c.down_count() + c.up_count(), repetitions) - << "Total count mismatch"; - - return c; -} - -struct BinomialTest { - double lower; - double upper; - double pvalue; -}; - -BinomialTest binomial_test(const int n, const int k, const float p, - const float alpha = 0.05) { - - boost::math::binomial_distribution<> dist(n, p); - double lower = boost::math::cdf(dist, k); - double upper = boost::math::cdf(complement(dist, k - 1)); - double pvalue = 2 * std::min(lower, upper); - - return BinomialTest{lower, upper, pvalue}; -} - -std::string demangle(const char *name) { - int status = -1; - std::unique_ptr res{ - abi::__cxa_demangle(name, NULL, NULL, &status), std::free}; - return (status == 0) ? res.get() : name; -} - -template ::binary> -void check_exact_proportion(T a, T b, Op op, const float proportion, - const int repetitions = 1000, - const float alpha = default_alpha) { - - auto counter = eval_op(a, b, op, repetitions); - auto test = binomial_test(repetitions, counter.down_count(), proportion); - - EXPECT_THAT(alpha / 2, Lt(test.pvalue)) - << "Null hypotheis rejected!\n" - // << "op: " << op << "\n" - << "alpha: " << alpha << "\n" - << "proportion: " << proportion << "\n" - << "count_down: " << counter.down_count() << "\n" - << "sample size: " << repetitions << "\n" - << "pvalue: " << test.pvalue << "\n" - << "lower_bound: " << test.lower << "\n" - << "upper_bound: " << test.upper << "\n" - << std::hexfloat << "down: " << counter.down() << "\n" - << std::hexfloat << "up: " << counter.down() << "\n"; -} - -template ::binary> -void check_distribution_match(T a, T b, Op reference_op, Op target_op, - const int repetitions = 1000, - const float alpha = default_alpha) { - using H = typename helper::IEEE754::H; - - H reference = reference_op(a, b); - auto [down, up] = compute_distance_error(a, b, reference); - - auto counter = eval_op(a, b, target_op, repetitions); - auto count_down = counter.down_count(); - auto count_up = counter.up_count(); - - EXPECT_EQ(count_down, repetitions * down) << "Down value count mismatch"; - EXPECT_EQ(count_up, repetitions * up) << "Up value count mismatch"; - EXPECT_EQ(count_down + count_up, repetitions) << "Total count mismatch"; - - auto down_test = binomial_test(repetitions, count_down, down, alpha); - - EXPECT_THAT(down, AllOf(Ge(down_test.lower_bound), Le(down_test.upper_bound))) - << "Propability of down value mismatch!\n" - << "op: " << target_op.target_type().name() << "\n" - << "down: " << down << "\n" - << "lower_bound_down: " << down_test.lower_bound << "\n" - << "upper_bound_down: " << down_test.upper_bound - << "sample size: " << repetitions << "\n" - << "count_down: " << counter.down_count() << "\n" - << "alpha: " << alpha << "\n"; - - auto up_test = binomial_test(repetitions, count_up, up, alpha); - - EXPECT_THAT(up, AllOf(Ge(up_test.lower_bound), Le(up_test.upper_bound))) - << "Propability of up value mismatch\n" - << "op: " << target_op.target_type().name() << "\n" - << "up: " << counter.up() << "\n" - << "lower_bound_up: " << up_test.lower_bound << "\n" - << "upper_bound_up: " << up_test.upper_bound - << "sample size: " << repetitions << "\n" - << "count_up: " << counter.up_count() << "\n" - << "alpha: " << alpha << "\n"; -} - -template std::vector get_simple_case() { - std::vector simple_case = {0.0, - 1.0, - 2.0, - 3.0, - std::numeric_limits::min(), - std::numeric_limits::lowest(), - std::numeric_limits::max(), - std::numeric_limits::epsilon(), - std::numeric_limits::infinity(), - std::numeric_limits::denorm_min(), - std::numeric_limits::quiet_NaN(), - std::numeric_limits::signaling_NaN()}; - return simple_case; -} - -enum class OperatorType { ADD, SUB, MUL, DIV }; - -template void do_basic_test() { - using I = typename helper::IEEE754::I; - constexpr I mantissa = helper::IEEE754::mantissa; - - T a = 1.25; - T b; - - std::function op; - switch (op_type) { - case OperatorType::ADD: - op = ud_add; - break; - case OperatorType::SUB: - op = ud_sub; - break; - case OperatorType::MUL: - op = ud_mul; - break; - case OperatorType::DIV: - op = ud_div; - break; - } - - for (int i = 0; i <= 5; i++) { - b = std::ldexp(1.0, -(mantissa + i)); - check_exact_proportion(a, b, op, 0.5, 100000, 0.01); - } -} - -template void do_basic_test_add() { - do_basic_test(); -} - -template void do_basic_test_sub() { - do_basic_test(); -} - -template void do_basic_test_mul() { - do_basic_test(); -} - -template void do_basic_test_div() { - do_basic_test(); -} - -TEST(UDRoundTest, BasicAssertionsAddition) { - do_basic_test_add(); - do_basic_test_add(); -} - -TEST(UDRoundTest, BasicAssertionsSubstraction) { - do_basic_test_sub(); - do_basic_test_sub(); -} - -TEST(UDRoundTest, BasicAssertionsMultiplication) { - do_basic_test_mul(); - do_basic_test_mul(); -} - -TEST(UDRoundTest, BasicAssertionsDivision) { - do_basic_test_div(); - do_basic_test_div(); -} - -// TEST(UDRoundTest, Random01Assertions) { -// RNG rng; - -// for (int i = 0; i < 1000; i++) { -// float a = rng(); -// float b = rng(); -// test_equality(a, b); -// test_equality(a, -b); -// test_equality(-a, b); -// test_equality(-a, -b); -// } - -// for (int i = 0; i < 1000; i++) { -// double a = rng(); -// double b = rng(); -// test_equality(a, b); -// test_equality(a, -b); -// test_equality(-a, b); -// test_equality(-a, -b); -// } -// } - -// TEST(UDRoundTest, RandomNoOverlapAssertions) { -// RNG rng_0v1_float(0, 1); -// RNG rng_24v25_float(0x1p24, 0x1p25); - -// for (int i = 0; i < 1000; i++) { -// float a = rng_0v1_float(); -// float b = rng_24v25_float(); -// test_equality(a, b); -// test_equality(a, -b); -// test_equality(-a, b); -// test_equality(-a, -b); -// } - -// RNG rng_0v1_double(0, 1); -// RNG rng_53v54_double(0x1p53, 0x1p54); - -// for (int i = 0; i < 1000; i++) { -// double a = rng_0v1_double(); -// double b = rng_53v54_double(); -// test_equality(a, b); -// test_equality(a, -b); -// test_equality(-a, b); -// test_equality(-a, -b); -// } -// } - -// TEST(UDRoundTest, RandomLastBitOverlapAssertions) { -// RNG rng_1v2_float(1, 2); -// RNG rng_23v24_float(0x1p23, 0x1p24); - -// for (int i = 0; i < 1000; i++) { -// float a = rng_1v2_float(); -// float b = rng_23v24_float(); -// test_equality(a, b); -// test_equality(a, -b); -// test_equality(-a, b); -// test_equality(-a, -b); -// } - -// RNG rng_1v2_double(1, 2); -// RNG rng_52v53_double(0x1p52, 0x1p53); - -// for (int i = 0; i < 1000; i++) { -// double a = rng_1v2_double(); -// double b = rng_52v53_double(); -// test_equality(a, b); -// test_equality(a, -b); -// test_equality(-a, b); -// test_equality(-a, -b); -// } -// } - -// TEST(UDRoundTest, RandomMidOverlapAssertions) { -// RNG rng_0v1_float(0, 1); -// RNG rng_12v13_float(0x1p12, 0x1p13); - -// for (int i = 0; i < 1000; i++) { -// float a = rng_0v1_float(); -// float b = rng_12v13_float(); -// test_equality(a, b); -// test_equality(a, -b); -// test_equality(-a, b); -// test_equality(-a, -b); -// } - -// RNG rng_0v1_double(0, 1); -// RNG rng_26v27_double(0x1p26, 0x1p27); - -// for (int i = 0; i < 1000; i++) { -// double a = rng_0v1_double(); -// double b = rng_26v27_double(); -// test_equality(a, b); -// test_equality(a, -b); -// test_equality(-a, b); -// test_equality(-a, -b); -// } -// } - -// TEST(UDRoundTest, BinadeAssertions) { -// for (int i = -126; i < 127; i++) { -// testBinade(i); -// testBinade(i); -// } -// } - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - init(); - return RUN_ALL_TESTS(); -} \ No newline at end of file