Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable parthenon::par_reduce for MD loops with Kokkos 1D Range #1130

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Current develop

### Added (new features/APIs/variables/...)
- [[PR 1130]](https://github.com/parthenon-hpc-lab/parthenon/pull/1130) Enable `parthenon::par_reduce` for MD loops with Kokkos 1D Range
- [[PR 1119]](https://github.com/parthenon-hpc-lab/parthenon/pull/1119) Formalize MeshData partitioning.
- [[PR 1128]](https://github.com/parthenon-hpc-lab/parthenon/pull/1128) Add cycle and nbtotal to hst
- [[PR 1099]](https://github.com/parthenon-hpc-lab/parthenon/pull/1099) Functionality for outputting task graphs in GraphViz format.
Expand Down
85 changes: 59 additions & 26 deletions src/kokkos_abstraction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,35 @@ par_dispatch(LoopPatternMDRange, const std::string &name, DevExecSpace exec_spac
function, std::forward<Args>(args)...);
}

template <typename, typename>
class FlatFunctor;

template <typename F, typename... Args>
auto MakeFlatFunctor(F &function, Args... args) {
return FlatFunctor<F, decltype(&F::operator())>(function, std::forward<Args>(args)...);
}

template <typename Function, typename R, typename T, typename Index, typename... FArgs>
class FlatFunctor<Function, R (T::*)(Index, Index, Index, FArgs...) const> {
int NjNi, Ni, kl, jl, il;
Function function;

public:
FlatFunctor(const Function _function, const int _NjNi, const int _Ni, const int _kl,
const int _jl, const int _il)
: function(_function), NjNi(_NjNi), Ni(_Ni), kl(_kl), jl(_jl), il(_il) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int &idx, FArgs &&...fargs) const {
int k = idx / NjNi;
int j = (idx - k * NjNi) / Ni;
int i = idx - k * NjNi - j * Ni;
k += kl;
j += jl;
i += il;
function(k, j, i, std::forward<FArgs>(fargs)...);
}
};

// 3D loop using Kokkos 1D Range
template <typename Tag, typename Function, class... Args>
inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
Expand All @@ -270,18 +299,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp
const int Ni = iu - il + 1;
const int NkNjNi = Nk * Nj * Ni;
const int NjNi = Nj * Ni;
kokkos_dispatch(
tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi),
KOKKOS_LAMBDA(const int &idx) {
int k = idx / NjNi;
int j = (idx - k * NjNi) / Ni;
int i = idx - k * NjNi - j * Ni;
k += kl;
j += jl;
i += il;
function(k, j, i);
},
std::forward<Args>(args)...);
kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NkNjNi),
MakeFlatFunctor(function, NjNi, Ni, kl, jl, il),
std::forward<Args>(args)...);
}

// 3D loop using MDRange loops
Expand Down Expand Up @@ -372,6 +392,30 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
function(k, j, i);
}

template <typename Function, typename R, typename T, typename Index, typename... FArgs>
class FlatFunctor<Function, R (T::*)(Index, Index, Index, Index, FArgs...) const> {
int NkNjNi, NjNi, Ni, nl, kl, jl, il;
Function function;

public:
FlatFunctor(const Function _function, const int _NkNjNi, const int _NjNi, const int _Ni,
const int _nl, const int _kl, const int _jl, const int _il)
: function(_function), NkNjNi(_NkNjNi), NjNi(_NjNi), Ni(_Ni), nl(_nl), kl(_kl),
jl(_jl), il(_il) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int &idx, FArgs &&...fargs) const {
int n = idx / NkNjNi;
int k = (idx - n * NkNjNi) / NjNi;
int j = (idx - n * NkNjNi - k * NjNi) / Ni;
int i = idx - n * NkNjNi - k * NjNi - j * Ni;
n += nl;
k += kl;
j += jl;
i += il;
function(n, k, j, i, std::forward<FArgs>(fargs)...);
}
};

// 4D loop using Kokkos 1D Range
template <typename Tag, typename Function, class... Args>
inline typename std::enable_if<sizeof...(Args) <= 1, void>::type
Expand All @@ -387,20 +431,9 @@ par_dispatch(LoopPatternFlatRange, const std::string &name, DevExecSpace exec_sp
const int NnNkNjNi = Nn * Nk * Nj * Ni;
const int NkNjNi = Nk * Nj * Ni;
const int NjNi = Nj * Ni;
kokkos_dispatch(
tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi),
KOKKOS_LAMBDA(const int &idx) {
int n = idx / NkNjNi;
int k = (idx - n * NkNjNi) / NjNi;
int j = (idx - n * NkNjNi - k * NjNi) / Ni;
int i = idx - n * NkNjNi - k * NjNi - j * Ni;
n += nl;
k += kl;
j += jl;
i += il;
function(n, k, j, i);
},
std::forward<Args>(args)...);
kokkos_dispatch(tag, name, Kokkos::RangePolicy<>(exec_space, 0, NnNkNjNi),
MakeFlatFunctor(function, NkNjNi, NjNi, Ni, nl, kl, jl, il),
std::forward<Args>(args)...);
}

// 4D loop using MDRange loops
Expand Down
81 changes: 77 additions & 4 deletions tst/unit/kokkos_abstraction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,12 +500,85 @@ bool test_wrapper_reduce_1d(T loop_pattern, DevExecSpace exec_space) {
return total == test_tot;
}

template <class T>
bool test_wrapper_reduce_3d(T loop_pattern, DevExecSpace exec_space) {
constexpr int N = 10;
parthenon::ParArray3D<int> buffer("Testing buffer", N, N, N);
// Initialize data
parthenon::par_for(
loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0,
N - 1, KOKKOS_LAMBDA(const int k, const int j, const int i) {
buffer(k, j, i) = i + j + k;
});
int tot = 0;
for (int k = 0; k < N; ++k) {
for (int j = 0; j < N; ++j) {
for (int i = 0; i < N; ++i) {
tot += i + j + k;
}
}
}
int test_tot = 0;
parthenon::par_reduce(
loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1,
KOKKOS_LAMBDA(const int k, const int j, const int i, int &t) { t += i + j + k; },
Kokkos::Sum<int>(test_tot));
return tot == test_tot;
}

template <class T>
bool test_wrapper_reduce_4d(T loop_pattern, DevExecSpace exec_space) {
constexpr int N = 10;
parthenon::ParArray4D<int> buffer("Testing buffer", N, N, N, N);
// Initialize data
parthenon::par_for(
loop_pattern, "Initialize parallel reduce array", exec_space, 0, N - 1, 0, N - 1, 0,
N - 1, 0, N - 1, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
buffer(n, k, j, i) = i + j + k + n;
});
int tot = 0;
for (int n = 0; n < N; ++n) {
for (int k = 0; k < N; ++k) {
for (int j = 0; j < N; ++j) {
for (int i = 0; i < N; ++i) {
tot += i + j + k + n;
}
}
}
}
int test_tot = 0;
parthenon::par_reduce(
loop_pattern, "Sum via par reduce", exec_space, 0, N - 1, 0, N - 1, 0, N - 1, 0,
N - 1,
KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, int &t) {
t += i + j + k + n;
},
Kokkos::Sum<int>(test_tot));
return tot == test_tot;
}

TEST_CASE("Parallel reduce", "[par_reduce]") {
auto default_exec_space = DevExecSpace();
REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag,
default_exec_space) == true);
if constexpr (std::is_same<DevExecSpace, Kokkos::Serial>::value) {
REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag,
SECTION("1D loops") {
REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_flatrange_tag,
default_exec_space) == true);
if constexpr (std::is_same<DevExecSpace, Kokkos::Serial>::value) {
REQUIRE(test_wrapper_reduce_1d(parthenon::loop_pattern_simdfor_tag,
default_exec_space) == true);
}
}

SECTION("3D loops") {
REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_flatrange_tag,
default_exec_space) == true);
REQUIRE(test_wrapper_reduce_3d(parthenon::loop_pattern_mdrange_tag,
default_exec_space) == true);
}

SECTION("4D loops") {
REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_flatrange_tag,
default_exec_space) == true);
REQUIRE(test_wrapper_reduce_4d(parthenon::loop_pattern_mdrange_tag,
default_exec_space) == true);
}
}
Loading