From 00c936c96d273da9b845c25481e5b3218ba1d4e8 Mon Sep 17 00:00:00 2001 From: Jonah Miller Date: Wed, 16 Oct 2024 08:56:23 -0600 Subject: [PATCH 01/18] bump Kokkos version --- external/Kokkos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/Kokkos b/external/Kokkos index 62d2b6c879b7..e0dc0128e04f 160000 --- a/external/Kokkos +++ b/external/Kokkos @@ -1 +1 @@ -Subproject commit 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6 +Subproject commit e0dc0128e04f18c2bbbaefceef3616e7ddcfa3c4 From 0f386249fec65bf2b4873711610ea51b30841341 Mon Sep 17 00:00:00 2001 From: Jonah Miller Date: Wed, 16 Oct 2024 08:57:38 -0600 Subject: [PATCH 02/18] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f046faabe75..71d100d528aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades ### Changed (changing behavior/API/variables/...) +- [[PR1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.2.01 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize - [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option From 86783d434bd465209467c889290bf8a8494ecb61 Mon Sep 17 00:00:00 2001 From: Patrick Mullen Date: Thu, 17 Oct 2024 08:38:16 -0600 Subject: [PATCH 03/18] Update Kokkos version to 4.4.1 --- CHANGELOG.md | 3 +-- external/Kokkos | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71d100d528aa..ff9137b4dfc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,9 +11,8 @@ - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades ### Changed (changing behavior/API/variables/...) -- [[PR1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.2.01 +- [[PR 1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.4.1 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize -- [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option - [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls diff --git a/external/Kokkos b/external/Kokkos index e0dc0128e04f..15dc143e5f39 160000 --- a/external/Kokkos +++ b/external/Kokkos @@ -1 +1 @@ -Subproject commit e0dc0128e04f18c2bbbaefceef3616e7ddcfa3c4 +Subproject commit 15dc143e5f39949eece972a798e175c4b463d4b8 From b4ab05f5f3c059e6a9fa208a3101c6e47bd0d3aa Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Fri, 18 Oct 2024 15:34:01 +0200 Subject: [PATCH 04/18] Fix View of View dealloc --- src/bvals/comms/bnd_info.hpp | 2 +- src/bvals/comms/bvals_utils.hpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp index e6214ceba322..8800f6fd867f 100644 --- a/src/bvals/comms/bnd_info.hpp +++ b/src/bvals/comms/bnd_info.hpp @@ -127,7 +127,7 @@ struct ProResInfo { int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb, std::shared_ptr> v); -using BndInfoArr_t = ParArray1D; +using BndInfoArr_t = Kokkos::View; using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror; using ProResInfoArr_t = ParArray1D; diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index f185c1207747..616d5cf21ee0 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -215,7 +215,8 @@ inline void RebuildBufferCache(std::shared_ptr> md, int nbound, using namespace loops; using namespace loops::shorthands; BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER); - cache.bnd_info = BndInfoArr_t("bnd_info", nbound); + cache.bnd_info = + BndInfoArr_t(Kokkos::view_alloc("bnd_info", Kokkos::SequentialHostInit), nbound); cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info); // prolongation/restriction sub-sets From e333a7b6c33a4ede8a2c0f9a435bbac85227b59b Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Mon, 21 Oct 2024 11:18:17 +0200 Subject: [PATCH 05/18] Move view_alloc to host --- src/bvals/comms/bvals_utils.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index 616d5cf21ee0..8a47c716aa22 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -215,9 +215,9 @@ inline void RebuildBufferCache(std::shared_ptr> md, int nbound, using namespace loops; using namespace loops::shorthands; BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER); - cache.bnd_info = - BndInfoArr_t(Kokkos::view_alloc("bnd_info", Kokkos::SequentialHostInit), nbound); - cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info); + cache.bnd_info = BndInfoArr_t("bnd_info", nbound); + cache.bnd_info_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), cache.bnd_info); // prolongation/restriction sub-sets // TODO(JMM): Right now I exclude fluxcorrection boundaries but if From 094880e12701ba8ee80e37e8a5b0ca52b5f74326 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Mon, 21 Oct 2024 11:34:18 +0200 Subject: [PATCH 06/18] Temp disable rocthrust on amd ci container --- cmake/machinecfg/GitHubActions.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake index 663dcb38d682..1adba870bdb2 100644 --- a/cmake/machinecfg/GitHubActions.cmake +++ b/cmake/machinecfg/GitHubActions.cmake @@ -19,6 +19,7 @@ message(STATUS "Loading machine configuration for GitHub Actions CI. ") # common options set(NUM_MPI_PROC_TESTING "2" CACHE STRING "CI runs tests with 2 MPI ranks") +set(Kokkos_ENABLE_ROCTHRUST OFF CACHE BOOL "Temporarily disabled as the container needs to be updated to the `-complete` base image.") set(MACHINE_CXX_FLAGS "") if (${MACHINE_VARIANT} MATCHES "cuda") From 0e0ef22cfc1540bc3156437fed5a4a903becc071 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Mon, 21 Oct 2024 11:42:56 +0200 Subject: [PATCH 07/18] Fix doc creation. Rollback to ubuntu22-04 --- .github/workflows/docs.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4b95511d82f5..7fb6e0165ff0 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -12,7 +12,10 @@ on: jobs: docs: name: build and deploy docs - runs-on: ubuntu-latest + # not using latest due to issues with pip user packages, see + # https://github.com/actions/runner-images/issues/10781 and + # https://github.com/actions/runner-images/issues/10636 + runs-on: ubuntu-22.04 steps: - name: Checkout code @@ -23,9 +26,9 @@ jobs: run: export DEBIAN_FRONTEND=noninteractive - name: install dependencies run: | - pip install --break-system-packages sphinx - pip install --break-system-packages sphinx-rtd-theme - pip install --break-system-packages sphinx-multiversion + pip install sphinx + pip install sphinx-rtd-theme + pip install sphinx-multiversion - name: build docs run: | echo "Repo = ${GITHUB_REPOSITORY}" From 343b0092e838425514e357685066305ab208b9d1 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Tue, 22 Oct 2024 12:02:45 +0200 Subject: [PATCH 08/18] Fix more view of views --- src/bvals/comms/bnd_info.cpp | 3 ++- src/interface/mesh_data.hpp | 3 ++- src/interface/sparse_pack_base.cpp | 6 ++++-- src/interface/swarm_comms.cpp | 1 + src/interface/swarm_pack_base.hpp | 6 ++++-- src/interface/variable_pack.hpp | 23 ++++++++++++++--------- src/parthenon_array_generic.hpp | 4 ++++ tst/unit/test_pararrays.cpp | 6 ++++-- 8 files changed, 35 insertions(+), 17 deletions(-) diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp index 736992260913..54a6ae2b50fd 100644 --- a/src/bvals/comms/bnd_info.cpp +++ b/src/bvals/comms/bnd_info.cpp @@ -41,7 +41,8 @@ namespace parthenon { void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) { prores_info = ParArray1D("prores_info", n_regions); - prores_info_h = Kokkos::create_mirror_view(prores_info); + prores_info_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), prores_info); int nref_funcs = pkg->NumRefinementFuncs(); // Note that assignment of Kokkos views resets them, but // buffer_subset_sizes is a std::vector. It must be cleared, then diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp index 14ae3959c32a..7d7d1cabcbe5 100644 --- a/src/interface/mesh_data.hpp +++ b/src/interface/mesh_data.hpp @@ -150,7 +150,8 @@ const MeshBlockPack

&PackOnMesh(M &map, BlockDataList_t &block_data_, if (make_new_pack) { ParArray1D

packs("MeshData::PackVariables::packs", nblocks); - auto packs_host = Kokkos::create_mirror_view(packs); + auto packs_host = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), packs); for (size_t i = 0; i < nblocks; i++) { const auto &pack = packing_function(block_data_[i], this_map, this_key); diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp index 2a7a5b70c41c..4ea5a558c3f5 100644 --- a/src/interface/sparse_pack_base.cpp +++ b/src/interface/sparse_pack_base.cpp @@ -152,7 +152,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, leading_dim += 2; } pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size); - pack.pack_h_ = Kokkos::create_mirror_view(pack.pack_); + pack.pack_h_ = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_); // For non-flat packs, shape of pack is type x block x var x k x j x i // where type here might be a flux. @@ -168,7 +169,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_); pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks); - auto coords_h = Kokkos::create_mirror_view(pack.coords_); + auto coords_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.coords_); // Fill the views int idx = 0; diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp index 3ee4e2af7512..bb5755e2cbe8 100644 --- a/src/interface/swarm_comms.cpp +++ b/src/interface/swarm_comms.cpp @@ -157,6 +157,7 @@ void Swarm::SetupPersistentMPI() { } void Swarm::CountParticlesToSend_() { + // TODO(PG->BRR) What's going on here? Why is the mask being copied? auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); auto swarm_d = GetDeviceContext(); auto pmb = GetBlockPointer(); diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp index 0733aa51f329..52a2c3c47fc7 100644 --- a/src/interface/swarm_pack_base.hpp +++ b/src/interface/swarm_pack_base.hpp @@ -109,7 +109,8 @@ class SwarmPackBase { // Allocate the views int leading_dim = 1; pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size); - auto pack_h = Kokkos::create_mirror_view(pack.pack_); + auto pack_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_); pack.bounds_ = bounds_t("bounds", 2, nblocks, nvar); auto bounds_h = Kokkos::create_mirror_view(pack.bounds_); @@ -154,7 +155,8 @@ class SwarmPackBase { Kokkos::deep_copy(pack.bounds_, bounds_h); pack.contexts_ = contexts_t("contexts", nblocks); - pack.contexts_h_ = Kokkos::create_mirror_view(pack.contexts_); + pack.contexts_h_ = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.contexts_); pack.max_active_indices_ = max_active_indices_t("max_active_indices", nblocks); pack.flat_index_map_ = max_active_indices_t("flat_index_map", nblocks + 1); BuildSupplemental(pmd, desc, pack); diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp index 037731093ce1..07e56091ddfe 100644 --- a/src/interface/variable_pack.hpp +++ b/src/interface/variable_pack.hpp @@ -570,10 +570,11 @@ void FillVarView(const VariableVector &vars, int vsize, bool coarse, assert(vsize == sparse_id_out.size()); assert(vsize == vector_component_out.size()); - auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out); - auto host_sp = Kokkos::create_mirror_view(Kokkos::HostSpace(), sparse_id_out); - auto host_vc = Kokkos::create_mirror_view(Kokkos::HostSpace(), vector_component_out); - auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), allocated_out); + auto host_cv = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out); + auto host_sp = Kokkos::create_mirror_view(sparse_id_out); + auto host_vc = Kokkos::create_mirror_view(vector_component_out); + auto host_al = Kokkos::create_mirror_view(allocated_out); int vindex = 0; for (const auto &v : vars) { @@ -634,7 +635,8 @@ void FillSwarmVarView(const vpack_types::SwarmVarList &vars, ViewOfParArrays1D &cv_out, PackIndexMap *pvmap) { using vpack_types::IndexPair; - auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out); + auto host_cv = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out); int vindex = 0; for (const auto v : vars) { @@ -675,10 +677,13 @@ void FillFluxViews(const VariableVector &vars, const int ndim, PackIndexMap *pvmap) { using vpack_types::IndexPair; - auto host_f1 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f1_out); - auto host_f2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f2_out); - auto host_f3 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f3_out); - auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), flux_allocated_out); + auto host_f1 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f1_out); + auto host_f2 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f2_out); + auto host_f3 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f3_out); + auto host_al = Kokkos::create_mirror_view(flux_allocated_out); int vindex = 0; for (const auto &v : vars) { diff --git a/src/parthenon_array_generic.hpp b/src/parthenon_array_generic.hpp index d527707f9070..ac38b6cb5e5f 100644 --- a/src/parthenon_array_generic.hpp +++ b/src/parthenon_array_generic.hpp @@ -221,6 +221,8 @@ class ParArrayGeneric : public State { // return GetDim(1) * GetDim(2) * GetDim(3) * GetDim(4) * GetDim(5) * GetDim(6); } + // TODO(PG?) Can we use concepts here to add a + // Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView? template auto GetMirror(MemSpace const &memspace) { auto mirror = Kokkos::create_mirror_view(memspace, data_); @@ -333,6 +335,8 @@ inline auto subview(std::index_sequence, return parthenon::ParArrayGeneric(v, arr); } +// TODO(PG?) Can we use concepts here to add a +// Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView? template inline auto create_mirror_view_and_copy(Space const &space, const parthenon::ParArrayGeneric &arr) { diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp index e79927c13b20..d281ce9e0c8d 100644 --- a/tst/unit/test_pararrays.cpp +++ b/tst/unit/test_pararrays.cpp @@ -452,7 +452,8 @@ TEST_CASE("ParArray state", "[ParArrayND]") { GIVEN("An array of ParArrays filled with the values contained in their state") { parthenon::ParArray1D pack("test pack", NS); - auto pack_h = Kokkos::create_mirror_view(pack); + auto pack_h = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack); for (int b = 0; b < NS; ++b) { state_t state(static_cast(b)); @@ -544,7 +545,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") { new (&views[n]) view_3d_t(Kokkos::view_alloc(label, Kokkos::WithoutInitializing), N, N, N); auto a_h = arrays(n).GetHostMirror(); - auto v_h = Kokkos::create_mirror_view(views(n)); + auto v_h = Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), + views(n)); for (int k = 0; k < N; k++) { for (int j = 0; j < N; j++) { for (int i = 0; i < N; i++) { From 4999a80408910c191c7c0e7fdee5aaca73afe50d Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Tue, 22 Oct 2024 19:40:46 +0200 Subject: [PATCH 09/18] Chasing more ViewOfView on host --- src/interface/variable_pack.hpp | 19 +++++++++++-------- src/kokkos_abstraction.hpp | 13 +++++++++++++ tst/unit/test_pararrays.cpp | 2 +- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp index 07e56091ddfe..76fce5f6b861 100644 --- a/src/interface/variable_pack.hpp +++ b/src/interface/variable_pack.hpp @@ -244,10 +244,11 @@ class PackIndexMap { }; template -using ViewOfParArrays = ParArray1D>; +using ViewOfParArrays = + Kokkos::View *, LayoutWrapper, DevMemSpace>; template -using ViewOfParArrays1D = ParArray1D>; +using ViewOfParArrays1D = Kokkos::View *, LayoutWrapper, DevMemSpace>; // forward declaration template @@ -760,10 +761,11 @@ VariableFluxPack MakeFluxPack(const VarListWithKeys &var_list, } // make the outer view - ViewOfParArrays cv("MakeFluxPack::cv", vsize * (extra_components ? 3 : 1)); - ViewOfParArrays f1("MakeFluxPack::f1", fsize); - ViewOfParArrays f2("MakeFluxPack::f2", fsize); - ViewOfParArrays f3("MakeFluxPack::f3", fsize); + ViewOfParArrays cv(ViewOfViewAlloc("MakeFluxPack::cv"), + vsize * (extra_components ? 3 : 1)); + ViewOfParArrays f1(ViewOfViewAlloc("MakeFluxPack::f1"), fsize); + ViewOfParArrays f2(ViewOfViewAlloc("MakeFluxPack::f2"), fsize); + ViewOfParArrays f3(ViewOfViewAlloc("MakeFluxPack::f3"), fsize); ParArray1D flux_allocated("MakePack::allocated", fsize); ParArray1D sparse_id("MakeFluxPack::sparse_id", vsize); ParArray1D vector_component("MakeFluxPack::vector_component", vsize); @@ -814,7 +816,8 @@ VariablePack MakePack(const VarListWithKeys &var_list, bool coarse, } // make the outer view - ViewOfParArrays cv("MakePack::cv", vsize * (extra_components ? 3 : 1)); + ViewOfParArrays cv(ViewOfViewAlloc("MakePack::cv"), + vsize * (extra_components ? 3 : 1)); ParArray1D sparse_id("MakePack::sparse_id", vsize); ParArray1D vector_component("MakePack::vector_component", vsize); ParArray1D allocated("MakePack::allocated", vsize); @@ -847,7 +850,7 @@ SwarmVariablePack MakeSwarmPack(const vpack_types::SwarmVarList &vars, } // make the outer view - ViewOfParArrays1D cv("MakePack::cv", vsize); + ViewOfParArrays1D cv(ViewOfViewAlloc("MakePack::cv"), vsize); std::array cv_size{0, 0}; if (vsize > 0) { diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 8fa89f82e95e..b5dfc3fa75a0 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -1035,6 +1035,19 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons reduction); } +// For ViewOfView we need to call the destructor of the inner views on +// the host and not on the device (which would happen by default). +// Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is +// on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be +// passed when calling `create_mirror_view`. +auto ViewOfViewAlloc(const std::string &label) { + if constexpr (std::is_same_v) { + return Kokkos::view_alloc(Kokkos::SequentialHostInit, label); + } else { + return Kokkos::view_alloc(label); + } +} + // reused from kokoks/core/perf_test/PerfTest_ExecSpacePartitioning.cpp // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7 template diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp index d281ce9e0c8d..9e1816b0e669 100644 --- a/tst/unit/test_pararrays.cpp +++ b/tst/unit/test_pararrays.cpp @@ -451,7 +451,7 @@ TEST_CASE("ParArray state", "[ParArrayND]") { } GIVEN("An array of ParArrays filled with the values contained in their state") { - parthenon::ParArray1D pack("test pack", NS); + Kokkos::View pack(parthenon::ViewOfViewAlloc("test pack"), NS); auto pack_h = Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack); From 05105495fdc39457ccadcd3e1b1d0c9e934edfe4 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Wed, 23 Oct 2024 16:53:03 -0400 Subject: [PATCH 10/18] output and crash if dt is tiny --- src/driver/driver.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp index 17b913e3a2ac..0443bdb6a756 100644 --- a/src/driver/driver.cpp +++ b/src/driver/driver.cpp @@ -133,6 +133,12 @@ DriverStatus EvolutionDriver::Execute() { // check for signals signal = SignalHandler::CheckSignalFlags(); + // TODO(bwibking): check for application debug callback + // currently hard-coded to check for tiny dt + if (tm.dt < 1e-10 * tm.time) { + signal = OutputSignal::final; + } + if (signal == OutputSignal::final) { break; } From c799d026bd226e3c175e04ff435e5c39744361ee Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Wed, 6 Nov 2024 16:32:37 +0100 Subject: [PATCH 11/18] Fix ViewAlloc func and add doc --- README.md | 2 +- doc/sphinx/src/development.rst | 28 ++++++++++++++++++++++++++++ src/kokkos_abstraction.hpp | 3 ++- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ed6a1bb05a16..f049c44ecd7d 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr * CMake 3.16 or greater * C++17 compatible compiler -* Kokkos 4.0.1 or greater +* Kokkos 4.1.1 or greater ## Optional (enabling features) diff --git a/doc/sphinx/src/development.rst b/doc/sphinx/src/development.rst index dbab91d8d5ce..98ac9cef90a8 100644 --- a/doc/sphinx/src/development.rst +++ b/doc/sphinx/src/development.rst @@ -62,6 +62,34 @@ parallelism interface that is needed for managing memory cached in tightly nested loops. The wrappers are documented :ref:`here `. +View of Views +------------- + +Special care needs to be taken when working with a ``View`` of ``View``. + +To repeat the Kokkos documenation: `Don't use them `__ + +But if you have to (which is the case in some places inside Parthenon) +then follow this pattern: + +.. code:: c++ + + Kokkos::View *> view_of_pararrays(parthenon::ViewOfViewAlloc("myname"), 10); + +The ``ViewOfViewAlloc`` ensures that the ``Kokkos::SequentialHostInit`` property is added, +which results in the (inner ``View`` ) deallocators being called on the host (rather than on +the device by default). + +Similarly, when you create a host mirror of said ``View`` of ``View`` add the additional +property for the same reason. + +.. code:: c++ + + auto view_of_pararrays_h = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view_of_pararrays); + +Note that the ``SequentialHostInit`` was only added in Kokkos 4.4.1 (which is now the default in Parthenon). + The need for reductions within function handling ``MeshBlock`` data ------------------------------------------------------------------- diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index b5dfc3fa75a0..37262dd0356c 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -1040,8 +1040,9 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons // Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is // on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be // passed when calling `create_mirror_view`. +template auto ViewOfViewAlloc(const std::string &label) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { return Kokkos::view_alloc(Kokkos::SequentialHostInit, label); } else { return Kokkos::view_alloc(label); From ebbcc1704ed2c68c99808cf099eef41396b2d5ce Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 13:41:44 +0100 Subject: [PATCH 12/18] Disable Ascent in testing --- .github/workflows/ci-extended.yml | 48 +++++++++++++++---------------- .github/workflows/ci-short.yml | 12 ++++---- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index 8ca646cfc2eb..0c083091cf0e 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -32,9 +32,9 @@ jobs: parallel: ['serial', 'mpi'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -91,27 +91,27 @@ jobs: ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600 # Test Ascent integration (only most complex setup with MPI and on device) - - name: Ascent tests - if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }} - run: | - cmake -B build-ascent \ - -DCMAKE_BUILD_TYPE=Release \ - -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \ - -DPARTHENON_ENABLE_ASCENT=ON \ - -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent - cmake --build build-ascent - cd example/advection/ - # Pick GPU with most available memory - export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') - mpirun -np 2 ../../build-ascent/example/advection/advection-example \ - -i parthinput.advection \ - parthenon/output5/dt=0.05 \ - parthenon/time/tlim=0.1 - # check if file exists - if [ ! -f "ascent_render_57.png" ]; then - echo "'ascent_render_57.png' does not exist." - exit 1 - fi + # - name: Ascent tests + # if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }} + # run: | + # cmake -B build-ascent \ + # -DCMAKE_BUILD_TYPE=Release \ + # -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \ + # -DPARTHENON_ENABLE_ASCENT=ON \ + # -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent + # cmake --build build-ascent + # cd example/advection/ + # # Pick GPU with most available memory + # export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') + # mpirun -np 2 ../../build-ascent/example/advection/advection-example \ + # -i parthinput.advection \ + # parthenon/output5/dt=0.05 \ + # parthenon/time/tlim=0.1 + # # check if file exists + # if [ ! -f "ascent_render_57.png" ]; then + # echo "'ascent_render_57.png' does not exist." + # exit 1 + # fi - uses: actions/upload-artifact@v3 with: @@ -120,7 +120,7 @@ jobs: build/CMakeFiles/CMakeOutput.log build/tst/regression/outputs/advection_convergence*/advection-errors.dat build/tst/regression/outputs/advection_convergence*/advection-errors.png - example/advection/ascent_render_57.png + # example/advection/ascent_render_57.png retention-days: 3 perf-and-regression-amdgpu: diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index ecb4052411ee..ab42a7450c83 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -20,9 +20,9 @@ jobs: style: runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -45,9 +45,9 @@ jobs: device: ['cuda', 'host'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -77,9 +77,9 @@ jobs: device: ['cuda', 'host'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: From 11e1e8614d07c0d85c0a04d914fde33e6115dbac Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 14:27:39 +0100 Subject: [PATCH 13/18] Revert "Disable Ascent in testing" This reverts commit ebbcc1704ed2c68c99808cf099eef41396b2d5ce. --- .github/workflows/ci-extended.yml | 48 +++++++++++++++---------------- .github/workflows/ci-short.yml | 12 ++++---- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index 0c083091cf0e..8ca646cfc2eb 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -32,9 +32,9 @@ jobs: parallel: ['serial', 'mpi'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 + options: --user 1001 steps: - uses: actions/checkout@v3 with: @@ -91,27 +91,27 @@ jobs: ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600 # Test Ascent integration (only most complex setup with MPI and on device) - # - name: Ascent tests - # if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }} - # run: | - # cmake -B build-ascent \ - # -DCMAKE_BUILD_TYPE=Release \ - # -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \ - # -DPARTHENON_ENABLE_ASCENT=ON \ - # -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent - # cmake --build build-ascent - # cd example/advection/ - # # Pick GPU with most available memory - # export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') - # mpirun -np 2 ../../build-ascent/example/advection/advection-example \ - # -i parthinput.advection \ - # parthenon/output5/dt=0.05 \ - # parthenon/time/tlim=0.1 - # # check if file exists - # if [ ! -f "ascent_render_57.png" ]; then - # echo "'ascent_render_57.png' does not exist." - # exit 1 - # fi + - name: Ascent tests + if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }} + run: | + cmake -B build-ascent \ + -DCMAKE_BUILD_TYPE=Release \ + -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \ + -DPARTHENON_ENABLE_ASCENT=ON \ + -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent + cmake --build build-ascent + cd example/advection/ + # Pick GPU with most available memory + export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') + mpirun -np 2 ../../build-ascent/example/advection/advection-example \ + -i parthinput.advection \ + parthenon/output5/dt=0.05 \ + parthenon/time/tlim=0.1 + # check if file exists + if [ ! -f "ascent_render_57.png" ]; then + echo "'ascent_render_57.png' does not exist." + exit 1 + fi - uses: actions/upload-artifact@v3 with: @@ -120,7 +120,7 @@ jobs: build/CMakeFiles/CMakeOutput.log build/tst/regression/outputs/advection_convergence*/advection-errors.dat build/tst/regression/outputs/advection_convergence*/advection-errors.png - # example/advection/ascent_render_57.png + example/advection/ascent_render_57.png retention-days: 3 perf-and-regression-amdgpu: diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index ab42a7450c83..ecb4052411ee 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -20,9 +20,9 @@ jobs: style: runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 + options: --user 1001 steps: - uses: actions/checkout@v3 with: @@ -45,9 +45,9 @@ jobs: device: ['cuda', 'host'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 + options: --user 1001 steps: - uses: actions/checkout@v3 with: @@ -77,9 +77,9 @@ jobs: device: ['cuda', 'host'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 + options: --user 1001 steps: - uses: actions/checkout@v3 with: From e632f6b7cbab5535f8db91eb67cb81fa49146513 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 14:31:04 +0100 Subject: [PATCH 14/18] Disable CUDA IPC --- .github/workflows/ci-extended.yml | 4 +++- .github/workflows/ci-short.yml | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index 8ca646cfc2eb..1dd39c782477 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -21,6 +21,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # CUDA IPC within docker repeated seem to cause issue on the CI machine + OMPI_MCA_btl_smcuda_use_cuda_ipc: 0 # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 OMPI_MCA_btl_vader_single_copy_mechanism: none @@ -34,7 +36,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index ecb4052411ee..7e0fd8bf759a 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -13,6 +13,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # CUDA IPC within docker repeated seem to cause issue on the CI machine + OMPI_MCA_btl_smcuda_use_cuda_ipc: 0 # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 OMPI_MCA_btl_vader_single_copy_mechanism: none @@ -22,7 +24,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -47,7 +49,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -79,7 +81,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: From 120ceef6bd4e34cc9fb10009cb654572a3a00dd4 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 17:08:48 +0100 Subject: [PATCH 15/18] Fix more view of views --- README.md | 2 +- src/bvals/bvals.hpp | 3 --- src/bvals/comms/bnd_info.cpp | 2 +- src/bvals/comms/bnd_info.hpp | 5 +++-- src/bvals/comms/bvals_utils.hpp | 3 ++- src/interface/mesh_data.hpp | 4 +++- src/interface/sparse_pack_base.cpp | 5 +++-- src/interface/sparse_pack_base.hpp | 4 +++- src/interface/swarm_pack_base.hpp | 11 ++++++----- tst/unit/test_pararrays.cpp | 2 ++ 10 files changed, 24 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index f049c44ecd7d..b874e5172889 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr * CMake 3.16 or greater * C++17 compatible compiler -* Kokkos 4.1.1 or greater +* Kokkos 4.4.1 or greater ## Optional (enabling features) diff --git a/src/bvals/bvals.hpp b/src/bvals/bvals.hpp index bc6ed0ebed56..aaa4da22c0f0 100644 --- a/src/bvals/bvals.hpp +++ b/src/bvals/bvals.hpp @@ -101,9 +101,6 @@ class BoundarySwarm : public BoundaryCommunication { explicit BoundarySwarm(std::weak_ptr pmb, const std::string &label); ~BoundarySwarm() = default; - std::vector> vars_int; - std::vector> vars_real; - // (usuallly the std::size_t unsigned integer type) std::vector::size_type bswarm_index; diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp index 54a6ae2b50fd..198489c1c1f0 100644 --- a/src/bvals/comms/bnd_info.cpp +++ b/src/bvals/comms/bnd_info.cpp @@ -40,7 +40,7 @@ namespace parthenon { void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) { - prores_info = ParArray1D("prores_info", n_regions); + prores_info = ProResInfoArr_t(ViewOfViewAlloc("prores_info"), n_regions); prores_info_h = Kokkos::create_mirror_view( Kokkos::view_alloc(Kokkos::SequentialHostInit), prores_info); int nref_funcs = pkg->NumRefinementFuncs(); diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp index 8800f6fd867f..8c6c85d59f93 100644 --- a/src/bvals/comms/bnd_info.hpp +++ b/src/bvals/comms/bnd_info.hpp @@ -26,6 +26,7 @@ #include "bvals/neighbor_block.hpp" #include "coordinates/coordinates.hpp" #include "interface/variable_state.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/forest/logical_coordinate_transformation.hpp" #include "utils/communication_buffer.hpp" @@ -130,8 +131,8 @@ int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb, using BndInfoArr_t = Kokkos::View; using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror; -using ProResInfoArr_t = ParArray1D; -using ProResInfoArrHost_t = typename ParArray1D::HostMirror; +using ProResInfoArr_t = Kokkos::View; +using ProResInfoArrHost_t = typename ProResInfoArr_t::HostMirror; class StateDescriptor; struct ProResCache_t { ProResInfoArr_t prores_info{}; diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index 8a47c716aa22..87b55b51019d 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -28,6 +28,7 @@ #include "bvals/comms/bnd_info.hpp" #include "bvals/comms/bvals_in_one.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/mesh.hpp" #include "mesh/meshblock.hpp" @@ -215,7 +216,7 @@ inline void RebuildBufferCache(std::shared_ptr> md, int nbound, using namespace loops; using namespace loops::shorthands; BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER); - cache.bnd_info = BndInfoArr_t("bnd_info", nbound); + cache.bnd_info = BndInfoArr_t(ViewOfViewAlloc("bnd_info"), nbound); cache.bnd_info_h = Kokkos::create_mirror_view( Kokkos::view_alloc(Kokkos::SequentialHostInit), cache.bnd_info); diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp index 7d7d1cabcbe5..9a86f0e52014 100644 --- a/src/interface/mesh_data.hpp +++ b/src/interface/mesh_data.hpp @@ -26,6 +26,7 @@ #include "interface/sparse_pack_base.hpp" #include "interface/swarm_pack_base.hpp" #include "interface/variable_pack.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/meshblock.hpp" #include "mesh/meshblock_pack.hpp" @@ -149,7 +150,8 @@ const MeshBlockPack

&PackOnMesh(M &map, BlockDataList_t &block_data_, } if (make_new_pack) { - ParArray1D

packs("MeshData::PackVariables::packs", nblocks); + Kokkos::View

packs( + ViewOfViewAlloc("MeshData::PackVariables::packs"), nblocks); auto packs_host = Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), packs); diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp index 4ea5a558c3f5..751e88c200c6 100644 --- a/src/interface/sparse_pack_base.cpp +++ b/src/interface/sparse_pack_base.cpp @@ -30,6 +30,7 @@ #include "interface/sparse_pack_base.hpp" #include "interface/state_descriptor.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { namespace impl { @@ -151,7 +152,7 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, } else if (contains_face_or_edge) { leading_dim += 2; } - pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size); + pack.pack_ = pack_t(ViewOfViewAlloc("data_ptr"), leading_dim, pack.nblocks_, max_size); pack.pack_h_ = Kokkos::create_mirror_view( Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_); @@ -168,7 +169,7 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, pack.block_props_ = block_props_t("block_props", nblocks, 27 + 1); pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_); - pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks); + pack.coords_ = coords_t(ViewOfViewAlloc("coords"), desc.flat ? max_size : nblocks); auto coords_h = Kokkos::create_mirror_view( Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.coords_); diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp index 0deca487a20a..25331c384b56 100644 --- a/src/interface/sparse_pack_base.hpp +++ b/src/interface/sparse_pack_base.hpp @@ -30,6 +30,7 @@ #include "interface/state_descriptor.hpp" #include "interface/variable.hpp" #include "interface/variable_state.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { @@ -55,7 +56,8 @@ class SparsePackBase { using alloc_t = std::vector; using include_t = std::vector; - using pack_t = ParArray3D>; + using pack_t = + Kokkos::View ***, LayoutWrapper, DevMemSpace>; using pack_h_t = typename pack_t::HostMirror; using bounds_t = ParArray3D; using bounds_h_t = typename bounds_t::HostMirror; diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp index 52a2c3c47fc7..6d0933b8485c 100644 --- a/src/interface/swarm_pack_base.hpp +++ b/src/interface/swarm_pack_base.hpp @@ -28,6 +28,7 @@ #include "interface/state_descriptor.hpp" #include "interface/swarm_device_context.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { @@ -43,10 +44,10 @@ class SwarmPackBase { SwarmPackBase() = default; virtual ~SwarmPackBase() = default; - using pack_t = ParArray3D>; + using pack_t = Kokkos::View ***, LayoutWrapper, DevMemSpace>; using bounds_t = ParArray3D; - using contexts_t = ParArray1D; - using contexts_h_t = typename ParArray1D::HostMirror; + using contexts_t = Kokkos::View; + using contexts_h_t = typename contexts_t::HostMirror; using max_active_indices_t = ParArray1D; using desc_t = impl::SwarmPackDescriptor; using idx_map_t = std::unordered_map; @@ -108,7 +109,7 @@ class SwarmPackBase { // Allocate the views int leading_dim = 1; - pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size); + pack.pack_ = pack_t(ViewOfViewAlloc("data_ptr"), leading_dim, nblocks, max_size); auto pack_h = Kokkos::create_mirror_view( Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_); @@ -154,7 +155,7 @@ class SwarmPackBase { Kokkos::deep_copy(pack.pack_, pack_h); Kokkos::deep_copy(pack.bounds_, bounds_h); - pack.contexts_ = contexts_t("contexts", nblocks); + pack.contexts_ = contexts_t(ViewOfViewAlloc("contexts"), nblocks); pack.contexts_h_ = Kokkos::create_mirror_view( Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.contexts_); pack.max_active_indices_ = max_active_indices_t("max_active_indices", nblocks); diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp index 9e1816b0e669..865dfbedf5e9 100644 --- a/tst/unit/test_pararrays.cpp +++ b/tst/unit/test_pararrays.cpp @@ -528,6 +528,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") { // view of views. See: // https://github.com/kokkos/kokkos/wiki/View#6232-whats-the-problem-with-a-view-of-views + // TODO(PG) depending on the results of the view of view discussion, we should add + // destructor or ViewOfViewAlloc with SequentialHostInit using view_3d_t = Kokkos::View; using arrays_t = Kokkos::View *, UVMSpace>; From eb44166831f88833cad96b6a0b8d5fbf5f540c9d Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 17:15:41 +0100 Subject: [PATCH 16/18] Fix missig update of private var --- src/interface/sparse_pack_base.hpp | 2 +- src/mesh/meshblock_pack.hpp | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp index 25331c384b56..089ced81f290 100644 --- a/src/interface/sparse_pack_base.hpp +++ b/src/interface/sparse_pack_base.hpp @@ -63,7 +63,7 @@ class SparsePackBase { using bounds_h_t = typename bounds_t::HostMirror; using block_props_t = ParArray2D; using block_props_h_t = typename block_props_t::HostMirror; - using coords_t = ParArray1D>; + using coords_t = Kokkos::View *, LayoutWrapper, DevMemSpace>; // Returns a SparsePackBase object that is either newly created or taken // from the cache in pmd. The cache itself handles the all of this logic diff --git a/src/mesh/meshblock_pack.hpp b/src/mesh/meshblock_pack.hpp index 5669e112109b..2624da53824c 100644 --- a/src/mesh/meshblock_pack.hpp +++ b/src/mesh/meshblock_pack.hpp @@ -21,6 +21,7 @@ #include #include +#include "Kokkos_Core_fwd.hpp" #include "coordinates/coordinates.hpp" #include "interface/variable_pack.hpp" #include "kokkos_abstraction.hpp" @@ -38,7 +39,8 @@ class MeshBlockPack { using pack_type = T; MeshBlockPack() = default; - MeshBlockPack(const ParArray1D view, const std::array dims) + MeshBlockPack(const Kokkos::View view, + const std::array dims) : v_(view), dims_(dims), ndim_((dims[2] > 1 ? 3 : (dims[1] > 1 ? 2 : 1))) {} KOKKOS_FORCEINLINE_FUNCTION @@ -85,7 +87,7 @@ class MeshBlockPack { const Coordinates_t &GetCoords(const int i) const { return v_(i).GetCoords(); } private: - ParArray1D v_; + Kokkos::View v_; std::array dims_; int ndim_; }; From eb03328e965e5b1c53843a0e60485860122b8311 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 18:12:12 +0100 Subject: [PATCH 17/18] Make linter happy --- src/mesh/meshblock_pack.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mesh/meshblock_pack.hpp b/src/mesh/meshblock_pack.hpp index 2624da53824c..1800489ccf0b 100644 --- a/src/mesh/meshblock_pack.hpp +++ b/src/mesh/meshblock_pack.hpp @@ -21,7 +21,6 @@ #include #include -#include "Kokkos_Core_fwd.hpp" #include "coordinates/coordinates.hpp" #include "interface/variable_pack.hpp" #include "kokkos_abstraction.hpp" From c6f8799c3b9747c677ffcdfe983ee6c930892ee8 Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Tue, 12 Nov 2024 12:36:27 -0500 Subject: [PATCH 18/18] return DriverStatus::failed --- src/driver/driver.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp index 0443bdb6a756..9d8ce66ad702 100644 --- a/src/driver/driver.cpp +++ b/src/driver/driver.cpp @@ -94,6 +94,8 @@ DriverStatus EvolutionDriver::Execute() { // Defaults must be set across all ranks DumpInputParameters(); + DriverStatus driver_status = DriverStatus::complete; + { // Main t < tmax loop region PARTHENON_INSTRUMENT while (tm.KeepGoing() && signal != OutputSignal::analysis) { @@ -135,8 +137,10 @@ DriverStatus EvolutionDriver::Execute() { // TODO(bwibking): check for application debug callback // currently hard-coded to check for tiny dt - if (tm.dt < 1e-10 * tm.time) { - signal = OutputSignal::final; + if (tm.dt < 1e-6 * tm.time) { + signal = OutputSignal::final; + driver_status = DriverStatus::failed; + // do not return here, since we still want to write an output } if (signal == OutputSignal::final) { @@ -161,14 +165,16 @@ DriverStatus EvolutionDriver::Execute() { pmesh->UserWorkAfterLoop(pmesh, pinput, tm); } - DriverStatus status = tm.KeepGoing() ? DriverStatus::timeout : DriverStatus::complete; + if (driver_status != DriverStatus::failed) { + driver_status = tm.KeepGoing() ? DriverStatus::timeout : DriverStatus::complete; + } // Do *not* write the "final" output, if this is analysis run. // The analysis output itself has already been written above before the main loop. if (signal != OutputSignal::analysis) { pouts->MakeOutputs(pmesh, pinput, &tm, OutputSignal::final); } - PostExecute(status); - return status; + PostExecute(driver_status); + return driver_status; } void EvolutionDriver::PostExecute(DriverStatus status) {