Horner's rule for polynomial evaluation with symmetry idea discussed in discussions #461 #477

Merged Jul 17, 2024 (20 commits)
Changes from 13 commits
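The idea, in scalar form: the spreading kernel is even about the window center, so the per-point Horner polynomials satisfy p_{w-1-m}(z) = p_m(-z). Splitting Horner's rule into even and odd parts in z^2 yields both p(z) and p(-z) for roughly half the multiply-adds. A minimal scalar sketch of the trick (illustrative only; horner_sym is a made-up name, and the PR's actual version is the SIMD code in the diff below):

```cpp
#include <array>
#include <cstddef>
#include <utility>

// Evaluate p(z) and p(-z) together, with the NC coefficients stored
// highest-degree first (as in padded_coeffs below). Writing
// p(z) = E(z^2) + z*O(z^2) gives p(-z) = E(z^2) - z*O(z^2), so the
// mirrored value costs one extra multiply-add.
template <std::size_t NC>
std::pair<double, double> horner_sym(const std::array<double, NC> &c, double z) {
  constexpr bool odd_degree = (NC % 2) == 0; // degree NC-1 is odd iff NC is even
  const double z2 = z * z;
  double k_odd  = odd_degree ? c[0] : 0.0;   // Horner accumulator for O(z^2)
  double k_even = c[odd_degree ? 1 : 0];     // Horner accumulator for E(z^2)
  for (std::size_t j = 1 + (odd_degree ? 1 : 0); j + 1 < NC; j += 2) {
    k_odd  = k_odd * z2 + c[j];
    k_even = k_even * z2 + c[j + 1];
  }
  return {k_even + z * k_odd, k_even - z * k_odd}; // {p(z), p(-z)}
}
```

The diff runs the same recurrence on whole SIMD batches of coefficient columns, writes the left half of the kernel directly, and obtains the mirrored right half from the same accumulators with a sign flip plus a lane reversal (swizzle), or a two-batch shuffle when w is not a multiple of the batch size.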
103 changes: 95 additions & 8 deletions src/spreadinterp.cpp
@@ -24,6 +24,8 @@ namespace { // anonymous namespace for internal structs equivalent to declaring
// static
struct zip_low;
struct zip_hi;
template<unsigned cap> struct reverse_index;
template<unsigned cap> struct shuffle_index;
struct select_even;
struct select_odd;
// forward declaration to clean up the code and be able to use this everywhere in the file
@@ -777,24 +779,99 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in
// [-1,1]
if (opts.upsampfac == 2.0) { // floating point equality is fine here
static constexpr auto alignment = simd_type::arch_type::alignment();
using arch_t = typename simd_type::arch_type;
static constexpr auto alignment = arch_t::alignment();
static constexpr auto simd_size = simd_type::size;
static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
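// (padded_ns is w rounded up to the next multiple of simd_size, a power of two)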
static constexpr auto nc = nc200<w>();
static constexpr auto horner_coeffs = get_horner_coeffs_200<FLT, w>();
static constexpr auto use_ker_sym = (simd_size < w);

alignas(alignment) static constexpr auto padded_coeffs =
pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs);
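// (rows are zero-padded to padded_ns so every batch load is aligned and in bounds)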

const simd_type zv(z);
for (uint8_t i = 0; i < w; i += simd_size) {
auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
for (uint8_t j = 1; j < nc; ++j) {
const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
k = xsimd::fma(k, zv, cji);
// use kernel symmetry trick if w > simd_size
if constexpr (use_ker_sym) {
static constexpr uint8_t tail = w % simd_size;
static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
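// (if_odd_degree is 1 iff the polynomial degree nc - 1 is odd, i.e. nc is even)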
static const simd_type zerov(0.0);
Collaborator: in my experiments returning {0} is often faster than saving it as a static const
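For context, the two variants being compared are roughly the following (a sketch, assuming xsimd's default-arch batch; the function names are made up):

```cpp
#include <xsimd/xsimd.hpp>

using simd_type = xsimd::batch<double>; // stand-in for the PR's simd_type

simd_type zero_static() {
  static const simd_type zerov(0.0); // initialized once, but may sit behind an init guard
  return zerov;
}

simd_type zero_inline() {
  return simd_type(0.0); // typically lowers to a single register-zeroing instruction
}
```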

const simd_type zv(z);
const simd_type z2v = zv * zv;

// no xsimd::shuffle needed if tail is zero
if constexpr (tail) {
// compile-time xsimd constant encoding the shuffle pattern
static constexpr auto shuffle_batch =
xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
shuffle_index<tail>>();

// process simd vecs
simd_type k_odd, k_even, k_prev, k_sym = zerov;
for (uint8_t i = 0, offset = w - tail; i < (w + 1) / 2;
i += simd_size, offset -= simd_size) {
k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i)
@DiamonDinoia (Collaborator, Jul 12, 2024): this should be:

k_odd = [i]() constexpr noexcept {
  if constexpr (if_odd_degree) {
    return simd_type::load_aligned(padded_coeffs[0].data() + i);
  } else return simd_type{0};
}();

: zerov;
k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
k_odd = xsimd::fma(k_odd, z2v, cji_odd);
const auto cji_even =
simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
Collaborator: is [j + 1] safe here? The loop runs up to nc; should it not run up to (nc - 1) or (nc & ~1)?
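(A parity check supports safety here: j starts at 1 + if_odd_degree, which has the same parity as nc, so the last iteration is j = nc - 2 and the [j + 1] access tops out at nc - 1.)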

k_even = xsimd::fma(k_even, z2v, cji_even);
}
// left part
xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
// right part symmetric to the left part
if (offset >= (w + 1) / 2) {
// to use an aligned store, we need to shuffle the previous k_sym together with the current one
k_prev = k_sym;
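// sign-flip the odd part: k(-z) = k_even - z * k_odd gives the mirrored kernel values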
k_sym = xsimd::fma(k_odd, -zv, k_even);
lu1and10 marked this conversation as resolved.
xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
}
}
} else {
// xsimd constants for reverse
static constexpr auto reverse_batch =
xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
reverse_index<simd_size>>();

// process simd vecs
for (uint8_t i = 0, offset = w - simd_size; i < w / 2;
i += simd_size, offset -= simd_size) {
auto k_odd = if_odd_degree
? simd_type::load_aligned(padded_coeffs[0].data() + i)
: zerov;
auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
k_odd = xsimd::fma(k_odd, z2v, cji_odd);
const auto cji_even =
simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
k_even = xsimd::fma(k_even, z2v, cji_even);
}
// left part
xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
// right part symmetric to the left part
if (offset >= w / 2) {
// reverse the order for symmetric part
xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch)
.store_aligned(ker + offset);
}
}
}
} else {
const simd_type zv(z);

for (uint8_t i = 0; i < w; i += simd_size) {
auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
for (uint8_t j = 1; j < nc; ++j) {
const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
k = xsimd::fma(k, zv, cji);
}
k.store_aligned(ker + i);
Collaborator: there is quite a lot of repetition; I think you could do something like w/2 + (tail > 0) to merge most of the code.

Collaborator: Could you give more of a hint what you're suggesting here? Or maybe Libin sees it...
}
k.store_aligned(ker + i);
}

return;
}
// insert the auto-generated code which expects z, w args, writes to ker...
@@ -2168,6 +2245,16 @@ struct zip_hi {
return (size + index) / 2;
}
};
template<unsigned cap> struct reverse_index {
static constexpr unsigned get(unsigned index, const unsigned size) {
return index < cap ? (cap - 1 - index) : index;
}
};
template<unsigned cap> struct shuffle_index {
static constexpr unsigned get(unsigned index, const unsigned size) {
return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
}
};
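A spot-check of the two index maps, assuming simd_size = 4 and tail = 2 (the asserts are illustrative and would live in the same translation unit as these structs):

```cpp
// reverse_index<simd_size> feeds xsimd::swizzle and flips all lanes of one batch:
static_assert(reverse_index<4>::get(0, 4) == 3);
static_assert(reverse_index<4>::get(3, 4) == 0);
// shuffle_index<tail> feeds xsimd::shuffle(k_sym, k_prev, ...): indices 0..3
// address k_sym, 4..7 address k_prev. The first `tail` lanes reverse the head
// of k_sym; the rest reverse the last simd_size - tail lanes of k_prev:
static_assert(shuffle_index<2>::get(0, 4) == 1); // k_sym[1]
static_assert(shuffle_index<2>::get(1, 4) == 0); // k_sym[0]
static_assert(shuffle_index<2>::get(2, 4) == 7); // k_prev[3]
static_assert(shuffle_index<2>::get(3, 4) == 6); // k_prev[2]
```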

struct select_even {
static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }