// -----------------------------------------------------------------------------
// Reviewer summary of the patch text below (the patch text itself is unchanged).
//
// Files touched:
//   * src/bvals/comms/boundary_communication.cpp  (hunks inside SetBounds)
//   * src/utils/indexer.hpp                       (hunk inside struct Indexer)
//
// SetBounds changes:
//   * Before the kernel launch, a plain loop over b in [0, nbound) and over
//     it in [0, bnd_info_h(b).ntopological_elements) takes the maximum of
//     bnd_info_h(b).idxer[it].sizeAtAndBelow(2) into Nel_max; Nteam = Nel_max.
//     (bnd_info_h is presumably the host-side mirror of bnd_info -- confirm.)
//   * The TeamPolicy league size goes from nbound to nbound * Nteam. Each team
//     decodes b = league_rank() / Nteam and bteam = league_rank() % Nteam.
//   * Per topological element the kernel now computes
//       Nel  = idxer.sizeAtAndBelow(2), Nidx = idxer.sizeAtAndAbove(3),
//       Ni   = idxer.size(5)   (replacing EndIdx<5>() - StartIdx<5>() + 1),
//     shrinks both TeamThreadRange extents to idxer.size() / Nel / Ni, and
//     offsets the flat index passed to idxer(...) and to buf(...) by
//     bteam * Nidx.
//   * The in-patch comment "Element t, u, v in variable" together with the
//     binding [t, u, v, k, j, i] = idxer(...) suggests dims 0..2 are t,u,v and
//     dims 3..5 are k,j,i -- confirm against Indexer's definition.
//
// Indexer additions:
//   * size(int dim)           returns end[dim] - start[dim] + 1.
//   * sizeAtAndAbove(int dim) multiplies size(i) for i in [dim, rank); the
//     accumulator starts at (dim < rank), so the result is 0 when dim >= rank.
//   * sizeAtAndBelow(int dim) multiplies size(i) for i in [0, dim]; the
//     accumulator starts at (dim >= 0), so the result is 0 when dim < 0.
//
// NOTE(review): `if (bteam >= Nel) return;` is inside the loop over `it`, so it
//   leaves the whole team body, not just the current topological element. That
//   matches "skip this element" only if Nel is identical for every `it` of a
//   given boundary -- confirm, otherwise later elements are never processed
//   by that team.
// NOTE(review): `tensor_size` is int while `Nel_max` is size_t, so
//   `tensor_size > Nel_max` is a mixed signed/unsigned comparison, and
//   `const int Nteam = Nel_max;` narrows size_t to int.
// NOTE(review): size(dim) and sizeAtAndBelow(dim) index start[]/end[] without
//   checking dim against rank; sizeAtAndBelow(dim) with dim >= rank would read
//   past the visible loop bound's intended range -- confirm callers only pass
//   valid dims (the visible call sites pass 2, 3 and 5).
// NOTE(review): this text appears to have lost its line breaks and the contents
//   of angle brackets (e.g. `std::shared_ptr> &md`, `static_cast(tel)`,
//   `std::make_index_sequence()`); restore from the original patch before
//   applying -- TODO confirm.
// -----------------------------------------------------------------------------
diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 78121cd3fac9..208942b74986 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -270,11 +270,24 @@ TaskStatus SetBounds(std::shared_ptr> &md) { } // const Real threshold = Globals::sparse_config.allocation_threshold; auto &bnd_info = cache.bnd_info; + auto &bnd_info_h = cache.bnd_info_h; + size_t Nel_max = 0; + for (int b = 0; b < nbound; b++) { + for (int it = 0; it < bnd_info_h(b).ntopological_elements; ++it) { + const int tensor_size = bnd_info_h(b).idxer[it].sizeAtAndBelow(2); + if (tensor_size > Nel_max) { + Nel_max = tensor_size; + } + } + } + const int Nteam = Nel_max; + Kokkos::parallel_for( PARTHENON_AUTO_LABEL, - Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), + Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound * Nteam, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { - const int b = team_member.league_rank(); + const int b = team_member.league_rank() / Nteam; + const int bteam = team_member.league_rank() % Nteam; if (bnd_info(b).same_to_same) return; int idx_offset = 0; for (int it = 0; it < bnd_info(b).ntopological_elements; ++it) { @@ -285,13 +298,17 @@ TaskStatus SetBounds(std::shared_ptr> &md) { lcoord_trans.InverseTransform(bnd_info(b).topo_idx[it]); Real fac = ftemp; // Can't capture structured bindings const int iel = static_cast(tel) % 3; - const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; + // Element t, u, v in variable + const int Nel = idxer.sizeAtAndBelow(2); + const int Nidx = idxer.sizeAtAndAbove(3); + if (bteam >= Nel) return; + const int Ni = idxer.size(5); if (bnd_info(b).buf_allocated && bnd_info(b).allocated) { Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Nel / Ni), [&](const int idx) { - Real *buf = 
&bnd_info(b).buf(idx * Ni + idx_offset); - const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *buf = &bnd_info(b).buf(bteam * Nidx + idx * Ni + idx_offset); + const auto [t, u, v, k, j, i] = idxer(bteam * Nidx + idx * Ni); // Have to do this because of some weird issue about structure bindings // being captured const int tt = t; @@ -311,9 +328,9 @@ TaskStatus SetBounds(std::shared_ptr> &md) { } else if (bnd_info(b).allocated && bound_type != BoundaryType::flxcor_recv) { const Real default_val = bnd_info(b).var.sparse_default_val; Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Nel / Ni), [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx * Ni); + const auto [t, u, v, k, j, i] = idxer(bteam * Nidx + idx * Ni); const int tt = t; const int uu = u; const int vv = v; diff --git a/src/utils/indexer.hpp b/src/utils/indexer.hpp index 43e4b613087c..1557963d86ee 100644 --- a/src/utils/indexer.hpp +++ b/src/utils/indexer.hpp @@ -90,6 +90,25 @@ struct Indexer { KOKKOS_FORCEINLINE_FUNCTION std::size_t size() const { return _size; } + KOKKOS_FORCEINLINE_FUNCTION + std::size_t size(int dim) const { return end[dim] - start[dim] + 1; } + + KOKKOS_FORCEINLINE_FUNCTION + std::size_t sizeAtAndAbove(int dim) const { + std::size_t out = dim < rank; + for (int i = dim; i < rank; ++i) + out *= size(i); + return out; + } + + KOKKOS_FORCEINLINE_FUNCTION + std::size_t sizeAtAndBelow(int dim) const { + std::size_t out = dim >= 0; + for (int i = 0; i <= dim; ++i) + out *= size(i); + return out; + } + KOKKOS_FORCEINLINE_FUNCTION std::tuple operator()(int idx) const { return GetIndicesImpl(idx, std::make_index_sequence());