Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add boundary comm inner loop specializations for performance and allow setting vector length #1175

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
- [[PR1173]](https://github.com/parthenon-hpc-lab/parthenon/pull/1173) Make debugging easier by making parthenon throw an error if ParameterInput is different on multiple MPI ranks.

### Infrastructure (changes irrelevant to downstream codes)
- [[PR1175]](https://github.com/parthenon-hpc-lab/parthenon/pull/1175) Add inner comm loop specializations to maximize comms performance
- [[PR1176]](https://github.com/parthenon-hpc-lab/parthenon/pull/1176) Move some code from header to implementation files

### Removed (removing behavior/API/variables/...)
Expand Down
1 change: 1 addition & 0 deletions src/basic_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ using Real = double;
// An inclusive 1D index interval [s, e] used to describe loop bounds.
struct IndexRange {
  int s = 0; /// Starting Index (inclusive)
  int e = 0; /// Ending Index (inclusive)
  // Allow an IndexRange to be used anywhere a {start, end} pair is expected.
  operator std::pair<int, int>() const { return {s, e}; }
};

// Enum speficying whether or not you requested a flux variable in
Expand Down
200 changes: 137 additions & 63 deletions src/bvals/comms/boundary_communication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {

Kokkos::parallel_for(
PARTHENON_AUTO_LABEL,
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO,
pmesh->GetCommVectorLength()),
KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
const int b = team_member.league_rank();

Expand All @@ -110,28 +111,42 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
auto &idxer = bnd_info(b).idxer[it];
const int iel = static_cast<int>(bnd_info(b).topo_idx[it]) % 3;
const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1;
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx, bool &lnon_zero) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });

bool mnon_zero = false;
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m, bool &llnon_zero) {
llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold);
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(mnon_zero));

lnon_zero = lnon_zero || mnon_zero;
if (bound_type == BoundaryType::flxcor_send) lnon_zero = true;
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(non_zero[iel]));
if (threshold > 0.0) {
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx, bool &lnon_zero) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });

bool mnon_zero = false;
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m, bool &llnon_zero) {
llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold);
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(mnon_zero));

lnon_zero = lnon_zero || mnon_zero;
if (bound_type == BoundaryType::flxcor_send) lnon_zero = true;
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(non_zero[iel]));
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });
});
non_zero[iel] = true;
}
idx_offset += idxer.size();
}
Kokkos::single(Kokkos::PerTeam(team_member), [&]() {
Expand Down Expand Up @@ -272,62 +287,121 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
auto &bnd_info = cache.bnd_info;
Kokkos::parallel_for(
PARTHENON_AUTO_LABEL,
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO,
pmesh->GetCommVectorLength()),
KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
const int b = team_member.league_rank();
if (bnd_info(b).same_to_same) return;
int idx_offset = 0;
for (int it = 0; it < bnd_info(b).ntopological_elements; ++it) {
auto &idxer = bnd_info(b).idxer[it];
auto &lcoord_trans = bnd_info(b).lcoord_trans;
const bool isTrivial = lcoord_trans.IsTrivial();
auto &var = bnd_info(b).var;
const auto [tel, ftemp] =
lcoord_trans.InverseTransform(bnd_info(b).topo_idx[it]);
const bool isCell = (tel == TopologicalElement::CC);
Real fac = ftemp; // Can't capture structured bindings
const int iel = static_cast<int>(tel) % 3;
const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1;
if (bnd_info(b).buf_allocated && bnd_info(b).allocated) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure bindings
// being captured
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m];
});
});
if (isTrivial && isCell) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { var_ptr[m] = buf[m]; });
});
} else if (isTrivial) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure
// bindings being captured
const int kk = k;
const int jj = j;
const int ii = i;
Real *var_ptr = &var(iel, t, u, v, kk, jj, ii);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) {
if (idxer.IsActive(kk, jj, ii + m))
var_ptr[m] = buf[m];
});
});
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure
// bindings being captured
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m];
});
});
}
} else if (bnd_info(b).allocated && bound_type != BoundaryType::flxcor_recv) {
const Real default_val = bnd_info(b).var.sparse_default_val;
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = default_val;
});
});
if (isTrivial && isCell) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { var_ptr[m] = default_val; });
});
} else if (isTrivial) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int kk = k;
const int jj = j;
const int ii = i;
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) {
if (idxer.IsActive(kk, jj, ii + m))
var_ptr[m] = default_val;
});
});
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = default_val;
});
});
}
}
idx_offset += idxer.size();
}
Expand Down
7 changes: 7 additions & 0 deletions src/mesh/forest/logical_coordinate_transformation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ struct LogicalCoordinateTransformation {
std::int64_t origin) const;
CellCentOffsets Transform(CellCentOffsets in) const;

// Check if this transformation includes at most a translation, i.e. the
// direction permutation is the identity and no axis is reflected. Callers can
// use this to select fast-path loops that skip per-element index transforms.
KOKKOS_INLINE_FUNCTION
bool IsTrivial() const {
  for (int d = 0; d < 3; ++d) {
    if (dir_connection[d] != d || dir_flip[d]) return false;
  }
  return true;
}

KOKKOS_INLINE_FUNCTION
std::tuple<TopologicalElement, Real> Transform(TopologicalElement el) const {
int iel = static_cast<int>(el);
Expand Down
2 changes: 2 additions & 0 deletions src/mesh/mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
default_pack_size_(pin->GetOrAddInteger("parthenon/mesh", "pack_size", -1)),
// private members:
num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)),
comm_vector_length_(
pin->GetOrAddInteger("parthenon/mesh", "comm_vector_length", 1)),
Yurlungur marked this conversation as resolved.
Show resolved Hide resolved
use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(),
lb_manual_(), nslist(Globals::nranks), nblist(Globals::nranks),
nref(Globals::nranks), nderef(Globals::nranks), rdisp(Globals::nranks),
Expand Down
3 changes: 3 additions & 0 deletions src/mesh/mesh.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ class Mesh {
return nblist[my_rank];
}
int GetNumMeshThreads() const { return num_mesh_threads_; }
int GetCommVectorLength() const { return comm_vector_length_; }

std::int64_t GetTotalCells();
// TODO(JMM): Move block_size into mesh.
int GetNumberOfMeshBlockCells() const;
Expand Down Expand Up @@ -259,6 +261,7 @@ class Mesh {
// data
int root_level, max_level, current_level;
int num_mesh_threads_;
int comm_vector_length_;
/// Maps Global Block IDs to which rank the block is mapped to.
std::vector<int> ranklist;
/// Maps rank to start of local block IDs.
Expand Down
18 changes: 18 additions & 0 deletions src/utils/indexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ struct Indexer {
return GetIndicesImpl(idx, std::make_index_sequence<sizeof...(Ts)>());
}

// Collapse a full set of per-dimension indices into the flat (1D) offset;
// the inverse operation of recovering indices from a flat position.
KOKKOS_FORCEINLINE_FUNCTION
std::size_t GetFlatIdx(Ts... ts) const {
  return GetFlatIndexImpl(ts..., std::index_sequence_for<Ts...>{});
}

KOKKOS_FORCEINLINE_FUNCTION
auto GetIdxArray(int idx) const {
return get_array_from_tuple(
Expand Down Expand Up @@ -128,6 +133,19 @@ struct Indexer {
return idxs;
}

// Compute the flat offset corresponding to the multi-dimensional indices
// idxs... . Each index is shifted by the dimension's start so the result is
// zero-based, then scaled by N[Is] — presumably the row-major stride factor
// for dimension Is as built by GetFactors (TODO confirm against GetFactors).
// The fold over the immediately-invoked lambdas visits dimensions in order;
// note the lambda mutates its pack element idxs in place before scaling.
template <std::size_t... Is>
KOKKOS_FORCEINLINE_FUNCTION std::size_t
GetFlatIndexImpl(Ts... idxs, std::index_sequence<Is...>) const {
  std::size_t out{0};
  (
      [&] {
        idxs -= start[Is];
        out += idxs * N[Is];
      }(),
      ...);
  return out;
}

template <std::size_t... Is>
KOKKOS_FORCEINLINE_FUNCTION static std::array<int, sizeof...(Ts)>
GetFactors(std::tuple<Ts...> Nt, std::index_sequence<Is...>) {
Expand Down
Loading