diff --git a/CHANGELOG.md b/CHANGELOG.md index 19ca04abc04d..6e940ad7fa79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - [[PR1173]](https://github.com/parthenon-hpc-lab/parthenon/pull/1173) Make debugging easier by making parthenon throw an error if ParameterInput is different on multiple MPI ranks. ### Infrastructure (changes irrelevant to downstream codes) +- [[PR 1175]](https://github.com/parthenon-hpc-lab/parthenon/pull/1175) Add inner comm loop specializations to maximize comms performance - [[PR1176]](https://github.com/parthenon-hpc-lab/parthenon/pull/1176) Move some code from header to implementation files ### Removed (removing behavior/API/varaibles/...) diff --git a/src/basic_types.hpp b/src/basic_types.hpp index f1f07878a533..16dedfc2f941 100644 --- a/src/basic_types.hpp +++ b/src/basic_types.hpp @@ -41,6 +41,7 @@ using Real = double; struct IndexRange { int s = 0; /// Starting Index (inclusive) int e = 0; /// Ending Index (inclusive) + operator std::pair() const {return {s, e};} }; // Enum speficying whether or not you requested a flux variable in diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 78121cd3fac9..5a4d66fc4c30 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -94,7 +94,8 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { Kokkos::parallel_for( PARTHENON_AUTO_LABEL, - Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), + Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO, + pmesh->GetCommVectorLength()), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); @@ -110,28 +111,42 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { auto &idxer = bnd_info(b).idxer[it]; const int iel = static_cast(bnd_info(b).topo_idx[it]) % 3; const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx, bool &lnon_zero) { - const auto [t, u, v, k, j, i] = idxer(idx * Ni); - Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); - Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); - - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), - [&](int m) { buf[m] = var[m]; }); - - bool mnon_zero = false; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange<>(team_member, Ni), - [&](int m, bool &llnon_zero) { - llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold); - }, - Kokkos::LOr(mnon_zero)); - - lnon_zero = lnon_zero || mnon_zero; - if (bound_type == BoundaryType::flxcor_send) lnon_zero = true; - }, - Kokkos::LOr(non_zero[iel])); + if (threshold > 0.0) { + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx, bool &lnon_zero) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { buf[m] = var[m]; }); + + bool mnon_zero = false; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m, bool &llnon_zero) { + llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold); + }, + Kokkos::LOr(mnon_zero)); + + lnon_zero = lnon_zero || mnon_zero; + if (bound_type == BoundaryType::flxcor_send) lnon_zero = true; + }, + Kokkos::LOr(non_zero[iel])); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { buf[m] = var[m]; }); + }); + non_zero[iel] = true; + } idx_offset += idxer.size(); } Kokkos::single(Kokkos::PerTeam(team_member), [&]() { @@ -272,7 +287,8 @@ TaskStatus SetBounds(std::shared_ptr> &md) { auto &bnd_info = cache.bnd_info; Kokkos::parallel_for( PARTHENON_AUTO_LABEL, - Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), + Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO, + pmesh->GetCommVectorLength()), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); if (bnd_info(b).same_to_same) return; @@ -280,54 +296,112 @@ TaskStatus SetBounds(std::shared_ptr> &md) { for (int it = 0; it < bnd_info(b).ntopological_elements; ++it) { auto &idxer = bnd_info(b).idxer[it]; auto &lcoord_trans = bnd_info(b).lcoord_trans; + const bool isTrivial = lcoord_trans.IsTrivial(); auto &var = bnd_info(b).var; const auto [tel, ftemp] = lcoord_trans.InverseTransform(bnd_info(b).topo_idx[it]); + const bool isCell = (tel == TopologicalElement::CC); Real fac = ftemp; // Can't capture structured bindings const int iel = static_cast(tel) % 3; const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; if (bnd_info(b).buf_allocated && bnd_info(b).allocated) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx) { - Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); - const auto [t, u, v, k, j, i] = idxer(idx * Ni); - // Have to do this because of some weird issue about structure bindings - // being captured - const int tt = t; - const int uu = u; - const int vv = v; - const int kk = k; - const int jj = j; - const int ii = i; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) { - const auto [il, jl, kl] = - lcoord_trans.InverseTransform({ii + m, jj, kk}); - if (idxer.IsActive(kl, jl, il)) - var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m]; - }); - }); + if (isTrivial && isCell) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var_ptr = &var(iel, t, u, v, k, j, i); + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { var_ptr[m] = buf[m]; }); + }); + } else if (isTrivial) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + // Have to do this because of some weird issue about structure + // bindings being captured + const int kk = k; + const int jj = j; + const int ii = i; + Real *var_ptr = &var(iel, t, u, v, kk, jj, ii); + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var_ptr[m] = buf[m]; + }); + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + // Have to do this because of some weird issue about structure + // bindings being captured + const int tt = t; + const int uu = u; + const int vv = v; + const int kk = k; + const int jj = j; + const int ii = i; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) { + const auto [il, jl, kl] = + lcoord_trans.InverseTransform({ii + m, jj, kk}); + if (idxer.IsActive(kl, jl, il)) + var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m]; + }); + }); + } } else if (bnd_info(b).allocated && bound_type != BoundaryType::flxcor_recv) { const Real default_val = bnd_info(b).var.sparse_default_val; - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx * Ni); - const int tt = t; - const int uu = u; - const int vv = v; - const int kk = k; - const int jj = j; - const int ii = i; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) { - const auto [il, jl, kl] = - lcoord_trans.InverseTransform({ii + m, jj, kk}); - if (idxer.IsActive(kl, jl, il)) - var(iel, tt, uu, vv, kl, jl, il) = default_val; - }); - }); + if (isTrivial && isCell) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var_ptr = &var(iel, t, u, v, k, j, i); + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { var_ptr[m] = default_val; }); + }); + } else if (isTrivial) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + const int kk = k; + const int jj = j; + const int ii = i; + Real *var_ptr = &var(iel, t, u, v, k, j, i); + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var_ptr[m] = default_val; + }); + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + const int tt = t; + const int uu = u; + const int vv = v; + const int kk = k; + const int jj = j; + const int ii = i; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) { + const auto [il, jl, kl] = + lcoord_trans.InverseTransform({ii + m, jj, kk}); + if (idxer.IsActive(kl, jl, il)) + var(iel, tt, uu, vv, kl, jl, il) = default_val; + }); + }); + } } idx_offset += idxer.size(); } diff --git a/src/mesh/forest/logical_coordinate_transformation.hpp b/src/mesh/forest/logical_coordinate_transformation.hpp index 0e0d0bc293cf..055c3df5a39a 100644 --- a/src/mesh/forest/logical_coordinate_transformation.hpp +++ b/src/mesh/forest/logical_coordinate_transformation.hpp @@ -52,6 +52,13 @@ struct LogicalCoordinateTransformation { std::int64_t origin) const; CellCentOffsets Transform(CellCentOffsets in) const; + // Check if this transformation includes at most a translation + KOKKOS_INLINE_FUNCTION + bool IsTrivial() const { + return (dir_connection[0] == 0) && (dir_connection[1] == 1) && + (dir_connection[2] == 2) && !dir_flip[0] && !dir_flip[1] && !dir_flip[2]; + } + KOKKOS_INLINE_FUNCTION std::tuple Transform(TopologicalElement el) const { int iel = static_cast(el); diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp index d513ec6e0b52..f8512a1ca0aa 100644 --- a/src/mesh/mesh.cpp +++ b/src/mesh/mesh.cpp @@ -81,6 +81,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages, default_pack_size_(pin->GetOrAddInteger("parthenon/mesh", "pack_size", -1)), // private members: num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)), + comm_vector_length_( + pin->GetOrAddInteger("parthenon/mesh", "comm_vector_length", 1)), use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(), lb_manual_(), nslist(Globals::nranks), nblist(Globals::nranks), nref(Globals::nranks), nderef(Globals::nranks), rdisp(Globals::nranks), diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp index 684a897aad56..6c41379bd2de 100644 --- a/src/mesh/mesh.hpp +++ b/src/mesh/mesh.hpp @@ -100,6 +100,8 @@ class Mesh { return nblist[my_rank]; } int GetNumMeshThreads() const { return num_mesh_threads_; } + int GetCommVectorLength() const { return comm_vector_length_; } + std::int64_t GetTotalCells(); // TODO(JMM): Move block_size into mesh. int GetNumberOfMeshBlockCells() const; @@ -259,6 +261,7 @@ class Mesh { // data int root_level, max_level, current_level; int num_mesh_threads_; + int comm_vector_length_; /// Maps Global Block IDs to which rank the block is mapped to. std::vector ranklist; /// Maps rank to start of local block IDs. diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index 43e4b613087c..f996b9c89281 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -95,6 +95,11 @@ struct Indexer { return GetIndicesImpl(idx, std::make_index_sequence()); } + KOKKOS_FORCEINLINE_FUNCTION + std::size_t GetFlatIdx(Ts... ts) const { + return GetFlatIndexImpl(ts..., std::make_index_sequence()); + } + KOKKOS_FORCEINLINE_FUNCTION auto GetIdxArray(int idx) const { return get_array_from_tuple( @@ -128,6 +133,19 @@ struct Indexer { return idxs; } + template + KOKKOS_FORCEINLINE_FUNCTION std::size_t + GetFlatIndexImpl(Ts... idxs, std::index_sequence) const { + std::size_t out{0}; + ( + [&] { + idxs -= start[Is]; + out += idxs * N[Is]; + }(), + ...); + return out; + } + template KOKKOS_FORCEINLINE_FUNCTION static std::array GetFactors(std::tuple Nt, std::index_sequence) {