Skip to content

Commit

Permalink
Add up-down operators
Browse files Browse the repository at this point in the history
  • Loading branch information
yohanchatelain committed Nov 21, 2024
1 parent 6fea34f commit 022cadf
Show file tree
Hide file tree
Showing 40 changed files with 4,613 additions and 851 deletions.
7 changes: 7 additions & 0 deletions src/libvfcinstrumentonline/rand/build-ir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,11 @@ bazelisk build --repo_env=CC=$(llvm-config-14 --bindir)/clang \
--cxxopt="-march=native" --cxxopt="-emit-llvm" --cxxopt="-O3" --save_temps \
--cxxopt="-USR_DEBUG" --cxxopt="-g0"

bazelisk build --repo_env=CC=$(llvm-config-14 --bindir)/clang \
--repo_env=CXX=$(llvm-config-14 --bindir)/clang++ \
--compile_one_dependency //src:ud_hw.cpp \
--cxxopt="-march=native" --cxxopt="-emit-llvm" --cxxopt="-O3" --save_temps \
--cxxopt="-USR_DEBUG" --cxxopt="-g0"

cp -f bazel-bin/src/_objs/sr/sr_hw.pic.s sr_hw.ll
cp -f bazel-bin/src/_objs/ud/ud_hw.pic.s ud_hw.ll
75 changes: 64 additions & 11 deletions src/libvfcinstrumentonline/rand/src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ exports_files([
"sr_hw.h",
"sr.h",
"sr_hw-inl.h",
"ud.hpp",
"ud.h",
"ud_hw.h",
"ud_hw.cpp",
"ud_hw-inl.h",
"utils.hpp",
"xoroshiro256+_hw.hpp",
"xoroshiro256+_hw.h",
Expand All @@ -20,23 +23,62 @@ exports_files([

# Compiler options
COPTS = [
"-std=c++20",
"-std=c++17",
"-Wfatal-errors",
]

filegroup(
name = "srcs",
srcs = glob([
"*.cpp",
"*.hpp",
"*.h",
]),
name = "xoroshiro",
srcs = [
"debug_hwy-inl.h",
"random-inl.h",
"target-utils.h",
"xoroshiro256+_hw.cpp",
"xoroshiro256+_hw.h",
"xoroshiro256+_hw-inl.hpp",
],
visibility = ["//visibility:public"],
)

filegroup(
name = "srcs-sr",
srcs = glob(
[
"*.cpp",
"*.hpp",
"*.h",
],
exclude = [
"ud_hw.cpp",
"ud_hw.h",
"ud_hw-inl.h",
"ud.h",
],
),
visibility = ["//visibility:public"],
)

filegroup(
name = "srcs-ud",
srcs = glob(
[
"*.cpp",
"*.hpp",
"*.h",
],
exclude = [
"sr_hw.cpp",
"sr_hw.h",
"sr_hw-inl.h",
"sr.h",
],
),
visibility = ["//visibility:public"],
)

cc_library(
name = "sr",
srcs = [":srcs"],
srcs = [":srcs-sr"],
hdrs = ["//src:sr_hw.h"],
copts = COPTS + [
"-O2",
Expand All @@ -47,6 +89,19 @@ cc_library(
deps = ["@hwy"],
)

cc_library(
name = "ud",
srcs = [":srcs-ud"],
hdrs = ["//src:ud_hw.h"],
copts = COPTS + [
"-O2",
"-Wall",
"-Wno-psabi",
],
visibility = ["//visibility:public"],
deps = ["@hwy"],
)

cc_library(
name = "sr-dbg",
srcs = [
Expand All @@ -66,10 +121,8 @@ cc_library(
"//src:sr_hw.h",
],
copts = COPTS + [
"-std=c++20",
"-Og",
"-g",
"-march=native",
"-Wfatal-errors",
"-DHWY_DISABLED_TARGETS=(HWY_AVX3|HWY_AVX3_ZEN4|HWY_AVX3_SPR)",
"-DSR_DEBUG",
Expand Down
Empty file.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <iostream>
#include <map>

const std::size_t N = 1'000'000;

#ifndef REAL
#define REAL double
#warning "REAL not defined, using double"
Expand Down Expand Up @@ -49,10 +51,10 @@ int main(int argc, char *argv[]) {

std::map<REAL, int> visited;

for (int i = 0; i < 1000; i++)
for (int i = 0; i < N; i++)
visited[apply_op(argv[1][0], a, b)]++;

compute_proba(visited, 1000);
compute_proba(visited, N);

return 0;
}
Binary file not shown.
Binary file not shown.
68 changes: 53 additions & 15 deletions src/libvfcinstrumentonline/rand/src/random-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ class Xoshiro {
class VectorXoshiro {
private:
using VU64 = Vec<ScalableTag<std::uint64_t>>;
using VU32 = Vec<ScalableTag<std::uint64_t>>;
using VU32 = Vec<ScalableTag<std::uint32_t>>;
using StateType = AlignedNDArray<std::uint64_t, 2>;
using VF32 = Vec<ScalableTag<float>>;
#if HWY_HAVE_FLOAT64
Expand All @@ -232,9 +232,34 @@ class VectorXoshiro {
}
}

HWY_INLINE VU64 operator()() noexcept { return Next(); }
// Draws one vector from the generator via Next() and reinterprets its bits as
// 32-bit unsigned lanes. The unnamed std::uint32_t parameter is never read; it
// only distinguishes this overload from the std::uint64_t one.
HWY_INLINE VU32 operator()(std::uint32_t) noexcept {
  const ScalableTag<std::uint32_t> u32_tag{};
  return BitCast(u32_tag, Next());
}

// 64-bit overload: hands back the vector produced by Next() unchanged. The
// unnamed std::uint64_t parameter is never read; it only selects the overload.
HWY_INLINE VU64 operator()(std::uint64_t) noexcept {
  const auto result = Next();
  return result;
}

AlignedVector<std::uint64_t> operator()(const std::size_t n) {
// Fills an AlignedVector with n 32-bit values taken from successive Update()
// calls, each 64-bit result vector being reinterpreted as 32-bit lanes.
//
// The unnamed std::uint32_t parameter is a tag that only selects this
// overload. The generator state is loaded into locals once, advanced in the
// loop, and written back to state_ at the end.
//
// Update() is called ceil(n / Lanes(u32_tag)) times, exactly as before, so the
// state evolution is unchanged. The difference is the tail: when n is not a
// multiple of the lane count, the last vector is written with StoreN so that
// only the remaining n - i elements are stored instead of a full vector that
// would run past the end of `result`.
AlignedVector<std::uint32_t> operator()(std::uint32_t, const std::size_t n) {
  const ScalableTag<std::uint32_t> u32_tag{};
  const ScalableTag<std::uint64_t> tag{};
  const std::size_t lanes = Lanes(u32_tag);
  AlignedVector<std::uint32_t> result(n);

  auto s0 = Load(tag, state_[{0}].data());
  auto s1 = Load(tag, state_[{1}].data());
  auto s2 = Load(tag, state_[{2}].data());
  auto s3 = Load(tag, state_[{3}].data());

  std::size_t i = 0;
  // Whole vectors: i stays a multiple of the lane count, so the aligned Store
  // is used as in the original loop.
  for (; n - i >= lanes; i += lanes) {
    const auto next = Update(s0, s1, s2, s3);
    Store(BitCast(u32_tag, next), u32_tag, result.data() + i);
  }
  // Partial tail: store only the lanes that still fit in `result`.
  if (i < n) {
    const auto next = Update(s0, s1, s2, s3);
    StoreN(BitCast(u32_tag, next), u32_tag, result.data() + i, n - i);
  }

  Store(s0, tag, state_[{0}].data());
  Store(s1, tag, state_[{1}].data());
  Store(s2, tag, state_[{2}].data());
  Store(s3, tag, state_[{3}].data());
  return result;
}

AlignedVector<std::uint64_t> operator()(std::uint64_t, const std::size_t n) {
AlignedVector<std::uint64_t> result(n);
const ScalableTag<std::uint64_t> tag{};
auto s0 = Load(tag, state_[{0}].data());
Expand All @@ -252,8 +277,29 @@ class VectorXoshiro {
return result;
}

// Returns a std::array of N 32-bit values taken from successive Update()
// calls, each 64-bit result vector being reinterpreted as 32-bit lanes.
//
// The unnamed std::uint32_t parameter is a tag that only selects this
// overload. The generator state is loaded into locals once, advanced in the
// loop, and written back to state_ at the end.
//
// Update() is called ceil(N / Lanes(u32_tag)) times, exactly as before. When N
// is not a multiple of the lane count, the final vector is written with StoreN
// so that only the remaining elements are stored; a full-vector Store there
// would write past the end of the N-element array.
template <std::uint32_t N>
std::array<std::uint32_t, N> operator()(std::uint32_t) noexcept {
  alignas(HWY_ALIGNMENT) std::array<std::uint32_t, N> result;
  const ScalableTag<std::uint64_t> tag{};
  const ScalableTag<std::uint32_t> u32_tag{};
  constexpr std::size_t kCount = static_cast<std::size_t>(N);
  const std::size_t lanes = Lanes(u32_tag);

  auto s0 = Load(tag, state_[{0}].data());
  auto s1 = Load(tag, state_[{1}].data());
  auto s2 = Load(tag, state_[{2}].data());
  auto s3 = Load(tag, state_[{3}].data());

  std::size_t i = 0;
  // Whole vectors: aligned Store, as in the original loop.
  for (; kCount - i >= lanes; i += lanes) {
    const auto next = Update(s0, s1, s2, s3);
    Store(BitCast(u32_tag, next), u32_tag, result.data() + i);
  }
  // Partial tail: store only the lanes that still fit in `result`.
  if (i < kCount) {
    const auto next = Update(s0, s1, s2, s3);
    StoreN(BitCast(u32_tag, next), u32_tag, result.data() + i, kCount - i);
  }

  Store(s0, tag, state_[{0}].data());
  Store(s1, tag, state_[{1}].data());
  Store(s2, tag, state_[{2}].data());
  Store(s3, tag, state_[{3}].data());
  return result;
}

template <std::uint64_t N>
std::array<std::uint64_t, N> operator()() noexcept {
std::array<std::uint64_t, N> operator()(std::uint64_t) noexcept {
alignas(HWY_ALIGNMENT) std::array<std::uint64_t, N> result;
const ScalableTag<std::uint64_t> tag{};
auto s0 = Load(tag, state_[{0}].data());
Expand Down Expand Up @@ -294,10 +340,7 @@ class VectorXoshiro {
}

HWY_INLINE AlignedVector<float> Uniform(float, const std::size_t n) {
// std::cerr << "Uniform float start\n";
AlignedVector<float> result(n);
// std::cerr << "result address: " << &result << "\n";
// std::cerr << "result size: " << result.size() << "\n";
const ScalableTag<std::uint32_t> u32_tag{};
const ScalableTag<std::uint64_t> tag{};
const ScalableTag<float> real_tag{};
Expand All @@ -309,22 +352,17 @@ class VectorXoshiro {
auto s3 = Load(tag, state_[{3}].data());

for (std::size_t i = 0; i < n; i += Lanes(real_tag)) {
// std::cerr << "i: " << i << "\n";
const auto next = Update(s0, s1, s2, s3);
const auto bits = BitCast(u32_tag, next);
const auto bitscast = ShiftRight<8>(bits);
const auto real = ConvertTo(real_tag, bitscast);
const auto uniform = Mul(real, MUL_VALUE);
// std::cerr << "store at " << i << " " << result.data() + i << "\n";
Store(uniform, real_tag, result.data() + i);
}

Store(s0, tag, state_[{0}].data());
Store(s1, tag, state_[{1}].data());
Store(s2, tag, state_[{2}].data());
Store(s3, tag, state_[{3}].data());

// std::cerr << "Uniform float end\n";
return result;
}

Expand Down Expand Up @@ -466,12 +504,12 @@ template <std::uint64_t size = 1024> class CachedXoshiro {

explicit CachedXoshiro(const result_type seed,
const result_type threadNumber = 0)
: generator_{seed, threadNumber}, cache_{generator_.operator()<size>()},
index_{0} {}
: generator_{seed, threadNumber},
cache_{generator_.operator()<size>(result_type{})}, index_{0} {}

result_type operator()() noexcept {
if (HWY_UNLIKELY(index_ == size)) {
cache_ = std::move(generator_.operator()<size>());
cache_ = std::move(generator_.operator()<size>(result_type{}));
index_ = 0;
}
return cache_[index_++];
Expand Down
2 changes: 1 addition & 1 deletion src/libvfcinstrumentonline/rand/src/sr.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ template <typename T> inline T sround(const T sigma, const T tau) {
debug_start();
if (tau == 0) {
debug_end();
return sigma;
return 0;
}
constexpr int32_t mantissa = IEEE754<T>::mantissa;
const bool sign_tau = tau < 0;
Expand Down
37 changes: 25 additions & 12 deletions src/libvfcinstrumentonline/rand/src/sr_hw-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,18 @@ void twosum(V a, V b, V &sigma, V &tau) {
}

/*
"Emulation of the FMA in rounded-to-nearest floating-point arithmetic"
Stef Graillat, Jean-Michel Muller
---
Algorithm 3 – Split(x, s). Veltkamp’s splitting algorithm. Returns a pair
(xh, xℓ) of FP numbers such that the significand of xh fits in s − p bits, the
significand of xℓ fits in s − 1 bits, and xh + xℓ = x.
Require: K = 2s + 1
Require: K = 2^s + 1
Require: 2 ≤ s ≤ p − 2
γ ← RN(K · x)
δ ← RN(x − γ)
ah ← RN(γ + δ)
aℓ ← RN(x − ah)
xh ← RN(γ + δ)
xℓ ← RN(x − xh)
return (xh, xℓ)
*/
template <class D, class V = hn::TFromD<D>, typename T = hn::TFromD<D>>
Expand All @@ -80,6 +83,9 @@ void Split(V x, V &xh, V &xl) {
}

/*
"Emulation of the FMA in rounded-to-nearest floating-point arithmetic"
Stef Graillat, Jean-Michel Muller
---
Algorithm 4 – DekkerProd(a, b). Dekker’s product. Returns a pair (πh, πℓ)
of FP numbers such that πh = RN(ab) and πh + πℓ = ab.
Require: s = ⌈p/2⌉
Expand Down Expand Up @@ -115,6 +121,9 @@ void DekkerProd(V a, V b, V &pi_h, V &pi_l) {
}

/*
"Emulation of the FMA in rounded-to-nearest floating-point arithmetic"
Stef Graillat, Jean-Michel Muller
---
Algorithm 7 EmulFMA(a, b, c).
Require: P = 2^(p−1) + 1
Require: Q = 2^(p−1)
Expand Down Expand Up @@ -192,11 +201,12 @@ V fma(V a, V b, V c) {
auto g = hn::Mul(t, w);
auto mask3 = hn::Lt(g, hn::Zero(d)); // if g < 0 then

auto res = hn::IfThenElse(
mask, d_temp_1,
hn::IfThenElse(mask1, z_h,
hn::IfThenElse(mask2, d_temp_2,
hn::IfThenElse(mask3, z_h, d_temp_2))));
auto ret3 = hn::IfThenElse(mask3, z_h, d_temp_2);
auto ret2 = hn::IfThenElse(mask2, d_temp_2, ret3);
auto ret1 = hn::IfThenElse(mask1, z_h, ret2);
auto ret = hn::IfThenElse(mask, d_temp_1, ret1);

auto res = ret;

debug_vec<D>("[fma] res", res);
debug_msg("[fma] END\n");
Expand Down Expand Up @@ -276,7 +286,7 @@ HWY_INLINE hn::Vec<D> FastPow2I(D d, VI x) {
const hn::Rebind<hwy::MakeSigned<D>, D> di;
const auto kOffset = Set(di, kOffsetS);
const auto offset = Add(x, kOffset);
const auto shift = ShiftLeft<mantissa>(offset);
const auto shift = hn::ShiftLeft<mantissa>(offset);
return BitCast(d, shift);
}

Expand Down Expand Up @@ -329,7 +339,7 @@ hn::Vec<D> pow2(D d, V n) {
return hn::BitCast(d, res);
}

template <class D, class V, typename T = hn::TFromD<D>>
template <class D, class V = hn::VFromD<D>, typename T = hn::TFromD<D>>
V sr_round(V sigma, V tau) {
debug_msg("\n[sr_round] START");
debug_vec<D>("[sr_round] sigma", sigma);
Expand Down Expand Up @@ -362,14 +372,17 @@ V sr_round(V sigma, V tau) {

auto exp = hn::Sub(eta, hn::Set(di, mantissa));
auto abs_ulp = pow2(d, exp);
debug_vec<D>("[sr_round] abs_ulp", abs_ulp);
debug_vec<D>("[sr_round] |ulp|", abs_ulp);

auto ulp = hn::CopySign(abs_ulp, tau);
debug_vec<D>("[sr_round] ulp", ulp);

auto pi = hn::Mul(ulp, z);
debug_vec<D>("[sr_round] pi", pi);

auto abs_tau_plus_pi = hn::Abs(hn::Add(tau, pi));
debug_vec<D>("[sr_round] abs_tau_plus_pi", abs_tau_plus_pi);
debug_vec<D>("[sr_round] |tau|+pi", abs_tau_plus_pi);

auto round = hn::IfThenElse(hn::Ge(abs_tau_plus_pi, abs_ulp), ulp, zero);
debug_vec<D>("[sr_round] round", round);

Expand Down
Loading

0 comments on commit 022cadf

Please sign in to comment.