diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index 8ca646cfc2eb..1dd39c782477 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -21,6 +21,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # Repeated use of CUDA IPC within docker seems to cause issues on the CI machine + OMPI_MCA_btl_smcuda_use_cuda_ipc: 0 # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 OMPI_MCA_btl_vader_single_copy_mechanism: none @@ -34,7 +36,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index ecb4052411ee..7e0fd8bf759a 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -13,6 +13,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # Repeated use of CUDA IPC within docker seems to cause issues on the CI machine + OMPI_MCA_btl_smcuda_use_cuda_ipc: 0 # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 OMPI_MCA_btl_vader_single_copy_mechanism: none @@ -22,7 +24,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -47,7 +49,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: 
--user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -79,7 +81,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index c29941247e43..6a00ac911ba5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## Current develop ### Added (new features/APIs/variables/...) -- [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185/files) Bugfix to particle defragmentation +- [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185) Bugfix to particle defragmentation - [[PR 1184]](https://github.com/parthenon-hpc-lab/parthenon/pull/1184) Fix swarm block neighbor indexing in 1D, 2D - [[PR 1183]](https://github.com/parthenon-hpc-lab/parthenon/pull/1183) Fix particle leapfrog example initialization data - [[PR 1179]](https://github.com/parthenon-hpc-lab/parthenon/pull/1179) Make a global variable for whether simulation is a restart @@ -11,11 +11,11 @@ - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades ### Changed (changing behavior/API/variables/...) 
+- [[PR 1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.4.1 - [[PR 1206]](https://github.com/parthenon-hpc-lab/parthenon/pull/1206) Leapfrog fix -- [[PR1203]](https://github.com/parthenon-hpc-lab/parthenon/pull/1203) Pin Ubuntu CI image -- [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag +- [[PR 1203]](https://github.com/parthenon-hpc-lab/parthenon/pull/1203) Pin Ubuntu CI image +- [[PR 1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize -- [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option - [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls @@ -32,7 +32,7 @@ ### Incompatibilities (i.e. breaking changes) -- [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag +- [[PR 1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag ## Release 24.08 Date: 2024-08-30 @@ -156,12 +156,12 @@ Date: 2024-03-21 - [[PR 973]](https://github.com/parthenon-hpc-lab/parthenon/pull/973) Multigrid performance upgrades ### Fixed (not changing behavior/API/variables/...) 
-- [[PR1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool -- [[PR1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code -- [[PR992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools -- [[PR988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes -- [[PR986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo cacheing -- [[PR978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check +- [[PR 1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool +- [[PR 1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code +- [[PR 992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools +- [[PR 988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes +- [[PR 986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo cacheing +- [[PR 978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check ### Infrastructure (changes irrelevant to downstream codes) - [[PR 1027]](https://github.com/parthenon-hpc-lab/parthenon/pull/1027) Refactor RestartReader as abstract class @@ -228,7 +228,7 @@ Date: 2023-11-16 - [[PR 901]](https://github.com/parthenon-hpc-lab/parthenon/pull/901) Implement shared element ownership model ### Removed (removing behavior/API/varaibles/...) -- [[PR 930](https://github.com/parthenon-hpc-lab/parthenon/pull/930) Remove ParthenonManager::ParthenonInit as it is error-prone and the split functions are the recommended usage. 
+- [[PR 930]](https://github.com/parthenon-hpc-lab/parthenon/pull/930) Remove ParthenonManager::ParthenonInit as it is error-prone and the split functions are the recommended usage. ## Release 0.8.0 diff --git a/README.md b/README.md index ed6a1bb05a16..b874e5172889 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr * CMake 3.16 or greater * C++17 compatible compiler -* Kokkos 4.0.1 or greater +* Kokkos 4.4.1 or greater ## Optional (enabling features) diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake index 663dcb38d682..1adba870bdb2 100644 --- a/cmake/machinecfg/GitHubActions.cmake +++ b/cmake/machinecfg/GitHubActions.cmake @@ -19,6 +19,7 @@ message(STATUS "Loading machine configuration for GitHub Actions CI. ") # common options set(NUM_MPI_PROC_TESTING "2" CACHE STRING "CI runs tests with 2 MPI ranks") +set(Kokkos_ENABLE_ROCTHRUST OFF CACHE BOOL "Temporarily disabled as the container needs to be updated to the `-complete` base image.") set(MACHINE_CXX_FLAGS "") if (${MACHINE_VARIANT} MATCHES "cuda") diff --git a/doc/sphinx/src/development.rst b/doc/sphinx/src/development.rst index dbab91d8d5ce..98ac9cef90a8 100644 --- a/doc/sphinx/src/development.rst +++ b/doc/sphinx/src/development.rst @@ -62,6 +62,34 @@ parallelism interface that is needed for managing memory cached in tightly nested loops. The wrappers are documented :ref:`here `. +View of Views +------------- + +Special care needs to be taken when working with a ``View`` of ``View``. + +To repeat the Kokkos documentation: `Don't use them `__ + +But if you have to (which is the case in some places inside Parthenon) +then follow this pattern: + +.. 
code:: c++ + + Kokkos::View *> view_of_pararrays(parthenon::ViewOfViewAlloc("myname"), 10); + +The ``ViewOfViewAlloc`` ensures that the ``Kokkos::SequentialHostInit`` property is added, +which results in the (inner ``View`` ) deallocators being called on the host (rather than on +the device by default). + +Similarly, when you create a host mirror of said ``View`` of ``View`` add the additional +property for the same reason. + +.. code:: c++ + + auto view_of_pararrays_h = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view_of_pararrays); + +Note that the ``SequentialHostInit`` was only added in Kokkos 4.4.1 (which is now the default in Parthenon). + The need for reductions within function handling ``MeshBlock`` data ------------------------------------------------------------------- diff --git a/external/Kokkos b/external/Kokkos index 62d2b6c879b7..15dc143e5f39 160000 --- a/external/Kokkos +++ b/external/Kokkos @@ -1 +1 @@ -Subproject commit 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6 +Subproject commit 15dc143e5f39949eece972a798e175c4b463d4b8 diff --git a/src/bvals/bvals.hpp b/src/bvals/bvals.hpp index bc6ed0ebed56..aaa4da22c0f0 100644 --- a/src/bvals/bvals.hpp +++ b/src/bvals/bvals.hpp @@ -101,9 +101,6 @@ class BoundarySwarm : public BoundaryCommunication { explicit BoundarySwarm(std::weak_ptr pmb, const std::string &label); ~BoundarySwarm() = default; - std::vector> vars_int; - std::vector> vars_real; - // (usuallly the std::size_t unsigned integer type) std::vector::size_type bswarm_index; diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp index 736992260913..198489c1c1f0 100644 --- a/src/bvals/comms/bnd_info.cpp +++ b/src/bvals/comms/bnd_info.cpp @@ -40,8 +40,9 @@ namespace parthenon { void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) { - prores_info = ParArray1D("prores_info", n_regions); - prores_info_h = Kokkos::create_mirror_view(prores_info); + prores_info = 
ProResInfoArr_t(ViewOfViewAlloc("prores_info"), n_regions); + prores_info_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), prores_info); int nref_funcs = pkg->NumRefinementFuncs(); // Note that assignment of Kokkos views resets them, but // buffer_subset_sizes is a std::vector. It must be cleared, then diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp index e6214ceba322..8c6c85d59f93 100644 --- a/src/bvals/comms/bnd_info.hpp +++ b/src/bvals/comms/bnd_info.hpp @@ -26,6 +26,7 @@ #include "bvals/neighbor_block.hpp" #include "coordinates/coordinates.hpp" #include "interface/variable_state.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/forest/logical_coordinate_transformation.hpp" #include "utils/communication_buffer.hpp" @@ -127,11 +128,11 @@ struct ProResInfo { int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb, std::shared_ptr> v); -using BndInfoArr_t = ParArray1D; +using BndInfoArr_t = Kokkos::View; using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror; -using ProResInfoArr_t = ParArray1D; -using ProResInfoArrHost_t = typename ParArray1D::HostMirror; +using ProResInfoArr_t = Kokkos::View; +using ProResInfoArrHost_t = typename ProResInfoArr_t::HostMirror; class StateDescriptor; struct ProResCache_t { ProResInfoArr_t prores_info{}; diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index f185c1207747..87b55b51019d 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -28,6 +28,7 @@ #include "bvals/comms/bnd_info.hpp" #include "bvals/comms/bvals_in_one.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/mesh.hpp" #include "mesh/meshblock.hpp" @@ -215,8 +216,9 @@ inline void RebuildBufferCache(std::shared_ptr> md, int nbound, using namespace loops; using namespace loops::shorthands; BvarsSubCache_t &cache = 
md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER); - cache.bnd_info = BndInfoArr_t("bnd_info", nbound); - cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info); + cache.bnd_info = BndInfoArr_t(ViewOfViewAlloc("bnd_info"), nbound); + cache.bnd_info_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), cache.bnd_info); // prolongation/restriction sub-sets // TODO(JMM): Right now I exclude fluxcorrection boundaries but if diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp index 17b913e3a2ac..9d8ce66ad702 100644 --- a/src/driver/driver.cpp +++ b/src/driver/driver.cpp @@ -94,6 +94,8 @@ DriverStatus EvolutionDriver::Execute() { // Defaults must be set across all ranks DumpInputParameters(); + DriverStatus driver_status = DriverStatus::complete; + { // Main t < tmax loop region PARTHENON_INSTRUMENT while (tm.KeepGoing() && signal != OutputSignal::analysis) { @@ -133,6 +135,14 @@ DriverStatus EvolutionDriver::Execute() { // check for signals signal = SignalHandler::CheckSignalFlags(); + // TODO(bwibking): check for application debug callback + // currently hard-coded to check for tiny dt + if (tm.dt < 1e-6 * tm.time) { + signal = OutputSignal::final; + driver_status = DriverStatus::failed; + // do not return here, since we still want to write an output + } + if (signal == OutputSignal::final) { break; } @@ -155,14 +165,16 @@ DriverStatus EvolutionDriver::Execute() { pmesh->UserWorkAfterLoop(pmesh, pinput, tm); } - DriverStatus status = tm.KeepGoing() ? DriverStatus::timeout : DriverStatus::complete; + if (driver_status != DriverStatus::failed) { + driver_status = tm.KeepGoing() ? DriverStatus::timeout : DriverStatus::complete; + } // Do *not* write the "final" output, if this is analysis run. // The analysis output itself has already been written above before the main loop. 
if (signal != OutputSignal::analysis) { pouts->MakeOutputs(pmesh, pinput, &tm, OutputSignal::final); } - PostExecute(status); - return status; + PostExecute(driver_status); + return driver_status; } void EvolutionDriver::PostExecute(DriverStatus status) { diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp index 14ae3959c32a..9a86f0e52014 100644 --- a/src/interface/mesh_data.hpp +++ b/src/interface/mesh_data.hpp @@ -26,6 +26,7 @@ #include "interface/sparse_pack_base.hpp" #include "interface/swarm_pack_base.hpp" #include "interface/variable_pack.hpp" +#include "kokkos_abstraction.hpp" #include "mesh/domain.hpp" #include "mesh/meshblock.hpp" #include "mesh/meshblock_pack.hpp" @@ -149,8 +150,10 @@ const MeshBlockPack

&PackOnMesh(M &map, BlockDataList_t &block_data_, } if (make_new_pack) { - ParArray1D

packs("MeshData::PackVariables::packs", nblocks); - auto packs_host = Kokkos::create_mirror_view(packs); + Kokkos::View

packs( + ViewOfViewAlloc("MeshData::PackVariables::packs"), nblocks); + auto packs_host = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), packs); for (size_t i = 0; i < nblocks; i++) { const auto &pack = packing_function(block_data_[i], this_map, this_key); diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp index 2a7a5b70c41c..751e88c200c6 100644 --- a/src/interface/sparse_pack_base.cpp +++ b/src/interface/sparse_pack_base.cpp @@ -30,6 +30,7 @@ #include "interface/sparse_pack_base.hpp" #include "interface/state_descriptor.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { namespace impl { @@ -151,8 +152,9 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, } else if (contains_face_or_edge) { leading_dim += 2; } - pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size); - pack.pack_h_ = Kokkos::create_mirror_view(pack.pack_); + pack.pack_ = pack_t(ViewOfViewAlloc("data_ptr"), leading_dim, pack.nblocks_, max_size); + pack.pack_h_ = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_); // For non-flat packs, shape of pack is type x block x var x k x j x i // where type here might be a flux. @@ -167,8 +169,9 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, pack.block_props_ = block_props_t("block_props", nblocks, 27 + 1); pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_); - pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks); - auto coords_h = Kokkos::create_mirror_view(pack.coords_); + pack.coords_ = coords_t(ViewOfViewAlloc("coords"), desc.flat ? 
max_size : nblocks); + auto coords_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.coords_); // Fill the views int idx = 0; diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp index 0deca487a20a..089ced81f290 100644 --- a/src/interface/sparse_pack_base.hpp +++ b/src/interface/sparse_pack_base.hpp @@ -30,6 +30,7 @@ #include "interface/state_descriptor.hpp" #include "interface/variable.hpp" #include "interface/variable_state.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { @@ -55,13 +56,14 @@ class SparsePackBase { using alloc_t = std::vector; using include_t = std::vector; - using pack_t = ParArray3D>; + using pack_t = + Kokkos::View ***, LayoutWrapper, DevMemSpace>; using pack_h_t = typename pack_t::HostMirror; using bounds_t = ParArray3D; using bounds_h_t = typename bounds_t::HostMirror; using block_props_t = ParArray2D; using block_props_h_t = typename block_props_t::HostMirror; - using coords_t = ParArray1D>; + using coords_t = Kokkos::View *, LayoutWrapper, DevMemSpace>; // Returns a SparsePackBase object that is either newly created or taken // from the cache in pmd. 
The cache itself handles the all of this logic diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp index 0733aa51f329..6d0933b8485c 100644 --- a/src/interface/swarm_pack_base.hpp +++ b/src/interface/swarm_pack_base.hpp @@ -28,6 +28,7 @@ #include "interface/state_descriptor.hpp" #include "interface/swarm_device_context.hpp" #include "interface/variable.hpp" +#include "kokkos_abstraction.hpp" #include "utils/utils.hpp" namespace parthenon { @@ -43,10 +44,10 @@ class SwarmPackBase { SwarmPackBase() = default; virtual ~SwarmPackBase() = default; - using pack_t = ParArray3D>; + using pack_t = Kokkos::View ***, LayoutWrapper, DevMemSpace>; using bounds_t = ParArray3D; - using contexts_t = ParArray1D; - using contexts_h_t = typename ParArray1D::HostMirror; + using contexts_t = Kokkos::View; + using contexts_h_t = typename contexts_t::HostMirror; using max_active_indices_t = ParArray1D; using desc_t = impl::SwarmPackDescriptor; using idx_map_t = std::unordered_map; @@ -108,8 +109,9 @@ class SwarmPackBase { // Allocate the views int leading_dim = 1; - pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size); - auto pack_h = Kokkos::create_mirror_view(pack.pack_); + pack.pack_ = pack_t(ViewOfViewAlloc("data_ptr"), leading_dim, nblocks, max_size); + auto pack_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_); pack.bounds_ = bounds_t("bounds", 2, nblocks, nvar); auto bounds_h = Kokkos::create_mirror_view(pack.bounds_); @@ -153,8 +155,9 @@ class SwarmPackBase { Kokkos::deep_copy(pack.pack_, pack_h); Kokkos::deep_copy(pack.bounds_, bounds_h); - pack.contexts_ = contexts_t("contexts", nblocks); - pack.contexts_h_ = Kokkos::create_mirror_view(pack.contexts_); + pack.contexts_ = contexts_t(ViewOfViewAlloc("contexts"), nblocks); + pack.contexts_h_ = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.contexts_); pack.max_active_indices_ = 
max_active_indices_t("max_active_indices", nblocks); pack.flat_index_map_ = max_active_indices_t("flat_index_map", nblocks + 1); BuildSupplemental(pmd, desc, pack); diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp index 037731093ce1..76fce5f6b861 100644 --- a/src/interface/variable_pack.hpp +++ b/src/interface/variable_pack.hpp @@ -244,10 +244,11 @@ class PackIndexMap { }; template -using ViewOfParArrays = ParArray1D>; +using ViewOfParArrays = + Kokkos::View *, LayoutWrapper, DevMemSpace>; template -using ViewOfParArrays1D = ParArray1D>; +using ViewOfParArrays1D = Kokkos::View *, LayoutWrapper, DevMemSpace>; // forward declaration template @@ -570,10 +571,11 @@ void FillVarView(const VariableVector &vars, int vsize, bool coarse, assert(vsize == sparse_id_out.size()); assert(vsize == vector_component_out.size()); - auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out); - auto host_sp = Kokkos::create_mirror_view(Kokkos::HostSpace(), sparse_id_out); - auto host_vc = Kokkos::create_mirror_view(Kokkos::HostSpace(), vector_component_out); - auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), allocated_out); + auto host_cv = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out); + auto host_sp = Kokkos::create_mirror_view(sparse_id_out); + auto host_vc = Kokkos::create_mirror_view(vector_component_out); + auto host_al = Kokkos::create_mirror_view(allocated_out); int vindex = 0; for (const auto &v : vars) { @@ -634,7 +636,8 @@ void FillSwarmVarView(const vpack_types::SwarmVarList &vars, ViewOfParArrays1D &cv_out, PackIndexMap *pvmap) { using vpack_types::IndexPair; - auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out); + auto host_cv = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out); int vindex = 0; for (const auto v : vars) { @@ -675,10 +678,13 @@ void FillFluxViews(const VariableVector &vars, const int ndim, PackIndexMap 
*pvmap) { using vpack_types::IndexPair; - auto host_f1 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f1_out); - auto host_f2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f2_out); - auto host_f3 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f3_out); - auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), flux_allocated_out); + auto host_f1 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f1_out); + auto host_f2 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f2_out); + auto host_f3 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f3_out); + auto host_al = Kokkos::create_mirror_view(flux_allocated_out); int vindex = 0; for (const auto &v : vars) { @@ -755,10 +761,11 @@ VariableFluxPack MakeFluxPack(const VarListWithKeys &var_list, } // make the outer view - ViewOfParArrays cv("MakeFluxPack::cv", vsize * (extra_components ? 3 : 1)); - ViewOfParArrays f1("MakeFluxPack::f1", fsize); - ViewOfParArrays f2("MakeFluxPack::f2", fsize); - ViewOfParArrays f3("MakeFluxPack::f3", fsize); + ViewOfParArrays cv(ViewOfViewAlloc("MakeFluxPack::cv"), + vsize * (extra_components ? 3 : 1)); + ViewOfParArrays f1(ViewOfViewAlloc("MakeFluxPack::f1"), fsize); + ViewOfParArrays f2(ViewOfViewAlloc("MakeFluxPack::f2"), fsize); + ViewOfParArrays f3(ViewOfViewAlloc("MakeFluxPack::f3"), fsize); ParArray1D flux_allocated("MakePack::allocated", fsize); ParArray1D sparse_id("MakeFluxPack::sparse_id", vsize); ParArray1D vector_component("MakeFluxPack::vector_component", vsize); @@ -809,7 +816,8 @@ VariablePack MakePack(const VarListWithKeys &var_list, bool coarse, } // make the outer view - ViewOfParArrays cv("MakePack::cv", vsize * (extra_components ? 3 : 1)); + ViewOfParArrays cv(ViewOfViewAlloc("MakePack::cv"), + vsize * (extra_components ? 
3 : 1)); ParArray1D sparse_id("MakePack::sparse_id", vsize); ParArray1D vector_component("MakePack::vector_component", vsize); ParArray1D allocated("MakePack::allocated", vsize); @@ -842,7 +850,7 @@ SwarmVariablePack MakeSwarmPack(const vpack_types::SwarmVarList &vars, } // make the outer view - ViewOfParArrays1D cv("MakePack::cv", vsize); + ViewOfParArrays1D cv(ViewOfViewAlloc("MakePack::cv"), vsize); std::array cv_size{0, 0}; if (vsize > 0) { diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 8fa89f82e95e..37262dd0356c 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -1035,6 +1035,20 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons reduction); } +// For ViewOfView we need to call the destructor of the inner views on +// the host and not on the device (which would happen by default). +// Thus, we need to pass `SequentialHostInit` as allocator, but only if the ViewOfView is +// on the host. If the ViewOfViews is on the device, then `SequentialHostInit` should be +// passed when calling `create_mirror_view`. +template +auto ViewOfViewAlloc(const std::string &label) { + if constexpr (std::is_same_v) { + return Kokkos::view_alloc(Kokkos::SequentialHostInit, label); + } else { + return Kokkos::view_alloc(label); + } +} + // reused from kokoks/core/perf_test/PerfTest_ExecSpacePartitioning.cpp // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7 template diff --git a/src/mesh/meshblock_pack.hpp b/src/mesh/meshblock_pack.hpp index 5669e112109b..1800489ccf0b 100644 --- a/src/mesh/meshblock_pack.hpp +++ b/src/mesh/meshblock_pack.hpp @@ -38,7 +38,8 @@ class MeshBlockPack { using pack_type = T; MeshBlockPack() = default; - MeshBlockPack(const ParArray1D view, const std::array dims) + MeshBlockPack(const Kokkos::View view, + const std::array dims) : v_(view), dims_(dims), ndim_((dims[2] > 1 ? 3 : (dims[1] > 1 ? 
2 : 1))) {} KOKKOS_FORCEINLINE_FUNCTION @@ -85,7 +86,7 @@ class MeshBlockPack { const Coordinates_t &GetCoords(const int i) const { return v_(i).GetCoords(); } private: - ParArray1D v_; + Kokkos::View v_; std::array dims_; int ndim_; }; diff --git a/src/parthenon_array_generic.hpp b/src/parthenon_array_generic.hpp index d527707f9070..ac38b6cb5e5f 100644 --- a/src/parthenon_array_generic.hpp +++ b/src/parthenon_array_generic.hpp @@ -221,6 +221,8 @@ class ParArrayGeneric : public State { // return GetDim(1) * GetDim(2) * GetDim(3) * GetDim(4) * GetDim(5) * GetDim(6); } + // TODO(PG?) Can we use concepts here to add a + // Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView? template auto GetMirror(MemSpace const &memspace) { auto mirror = Kokkos::create_mirror_view(memspace, data_); @@ -333,6 +335,8 @@ inline auto subview(std::index_sequence, return parthenon::ParArrayGeneric(v, arr); } +// TODO(PG?) Can we use concepts here to add a +// Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView? template inline auto create_mirror_view_and_copy(Space const &space, const parthenon::ParArrayGeneric &arr) { diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp index e79927c13b20..865dfbedf5e9 100644 --- a/tst/unit/test_pararrays.cpp +++ b/tst/unit/test_pararrays.cpp @@ -451,8 +451,9 @@ TEST_CASE("ParArray state", "[ParArrayND]") { } GIVEN("An array of ParArrays filled with the values contained in their state") { - parthenon::ParArray1D pack("test pack", NS); - auto pack_h = Kokkos::create_mirror_view(pack); + Kokkos::View pack(parthenon::ViewOfViewAlloc("test pack"), NS); + auto pack_h = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack); for (int b = 0; b < NS; ++b) { state_t state(static_cast(b)); @@ -527,6 +528,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") { // view of views. 
See: // https://github.com/kokkos/kokkos/wiki/View#6232-whats-the-problem-with-a-view-of-views + // TODO(PG) depending on the results of the view of view discussion, we should add + // destructor or ViewOfViewAlloc with SequentialHostInit using view_3d_t = Kokkos::View; using arrays_t = Kokkos::View *, UVMSpace>; @@ -544,7 +547,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") { new (&views[n]) view_3d_t(Kokkos::view_alloc(label, Kokkos::WithoutInitializing), N, N, N); auto a_h = arrays(n).GetHostMirror(); - auto v_h = Kokkos::create_mirror_view(views(n)); + auto v_h = Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), + views(n)); for (int k = 0; k < N; k++) { for (int j = 0; j < N; j++) { for (int i = 0; i < N; i++) {