From 00c936c96d273da9b845c25481e5b3218ba1d4e8 Mon Sep 17 00:00:00 2001
From: Jonah Miller <jonahm@lanl.gov>
Date: Wed, 16 Oct 2024 08:56:23 -0600
Subject: [PATCH 01/18] bump Kokkos version

---
 external/Kokkos | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/external/Kokkos b/external/Kokkos
index 62d2b6c879b7..e0dc0128e04f 160000
--- a/external/Kokkos
+++ b/external/Kokkos
@@ -1 +1 @@
-Subproject commit 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6
+Subproject commit e0dc0128e04f18c2bbbaefceef3616e7ddcfa3c4

From 0f386249fec65bf2b4873711610ea51b30841341 Mon Sep 17 00:00:00 2001
From: Jonah Miller <jonahm@lanl.gov>
Date: Wed, 16 Oct 2024 08:57:38 -0600
Subject: [PATCH 02/18] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f046faabe75..71d100d528aa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades
 
 ### Changed (changing behavior/API/variables/...)
+- [[PR1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.2.01
 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize
 - [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1
 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option

From 86783d434bd465209467c889290bf8a8494ecb61 Mon Sep 17 00:00:00 2001
From: Patrick Mullen <pdmullen@pn2293621.lanl.gov>
Date: Thu, 17 Oct 2024 08:38:16 -0600
Subject: [PATCH 03/18] Update Kokkos version to 4.4.1

---
 CHANGELOG.md    | 3 +--
 external/Kokkos | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71d100d528aa..ff9137b4dfc7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,9 +11,8 @@
 - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades
 
 ### Changed (changing behavior/API/variables/...)
-- [[PR1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.2.01
+- [[PR 1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.4.1
 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize
-- [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1
 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option
 - [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls
 
diff --git a/external/Kokkos b/external/Kokkos
index e0dc0128e04f..15dc143e5f39 160000
--- a/external/Kokkos
+++ b/external/Kokkos
@@ -1 +1 @@
-Subproject commit e0dc0128e04f18c2bbbaefceef3616e7ddcfa3c4
+Subproject commit 15dc143e5f39949eece972a798e175c4b463d4b8

From b4ab05f5f3c059e6a9fa208a3101c6e47bd0d3aa Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Fri, 18 Oct 2024 15:34:01 +0200
Subject: [PATCH 04/18] Fix View of View dealloc

---
 src/bvals/comms/bnd_info.hpp    | 2 +-
 src/bvals/comms/bvals_utils.hpp | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp
index e6214ceba322..8800f6fd867f 100644
--- a/src/bvals/comms/bnd_info.hpp
+++ b/src/bvals/comms/bnd_info.hpp
@@ -127,7 +127,7 @@ struct ProResInfo {
 int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb,
                   std::shared_ptr<Variable<Real>> v);
 
-using BndInfoArr_t = ParArray1D<BndInfo>;
+using BndInfoArr_t = Kokkos::View<BndInfo *, LayoutWrapper, DevMemSpace>;
 using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror;
 
 using ProResInfoArr_t = ParArray1D<ProResInfo>;
diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp
index f185c1207747..616d5cf21ee0 100644
--- a/src/bvals/comms/bvals_utils.hpp
+++ b/src/bvals/comms/bvals_utils.hpp
@@ -215,7 +215,8 @@ inline void RebuildBufferCache(std::shared_ptr<MeshData<Real>> md, int nbound,
   using namespace loops;
   using namespace loops::shorthands;
   BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER);
-  cache.bnd_info = BndInfoArr_t("bnd_info", nbound);
+  cache.bnd_info =
+      BndInfoArr_t(Kokkos::view_alloc("bnd_info", Kokkos::SequentialHostInit), nbound);
   cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info);
 
   // prolongation/restriction sub-sets

From e333a7b6c33a4ede8a2c0f9a435bbac85227b59b Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Mon, 21 Oct 2024 11:18:17 +0200
Subject: [PATCH 05/18] Move view_alloc to host

---
 src/bvals/comms/bvals_utils.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp
index 616d5cf21ee0..8a47c716aa22 100644
--- a/src/bvals/comms/bvals_utils.hpp
+++ b/src/bvals/comms/bvals_utils.hpp
@@ -215,9 +215,9 @@ inline void RebuildBufferCache(std::shared_ptr<MeshData<Real>> md, int nbound,
   using namespace loops;
   using namespace loops::shorthands;
   BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER);
-  cache.bnd_info =
-      BndInfoArr_t(Kokkos::view_alloc("bnd_info", Kokkos::SequentialHostInit), nbound);
-  cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info);
+  cache.bnd_info = BndInfoArr_t("bnd_info", nbound);
+  cache.bnd_info_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), cache.bnd_info);
 
   // prolongation/restriction sub-sets
   // TODO(JMM): Right now I exclude fluxcorrection boundaries but if

From 094880e12701ba8ee80e37e8a5b0ca52b5f74326 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Mon, 21 Oct 2024 11:34:18 +0200
Subject: [PATCH 06/18] Temp disable rocthrust on amd ci container

---
 cmake/machinecfg/GitHubActions.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake
index 663dcb38d682..1adba870bdb2 100644
--- a/cmake/machinecfg/GitHubActions.cmake
+++ b/cmake/machinecfg/GitHubActions.cmake
@@ -19,6 +19,7 @@ message(STATUS "Loading machine configuration for GitHub Actions CI. ")
 
 # common options
 set(NUM_MPI_PROC_TESTING "2" CACHE STRING "CI runs tests with 2 MPI ranks")
+set(Kokkos_ENABLE_ROCTHRUST OFF CACHE BOOL "Temporarily disabled as the container needs to be updated to the `-complete` base image.")
 
 set(MACHINE_CXX_FLAGS "")
 if (${MACHINE_VARIANT} MATCHES "cuda")

From 0e0ef22cfc1540bc3156437fed5a4a903becc071 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Mon, 21 Oct 2024 11:42:56 +0200
Subject: [PATCH 07/18] Fix doc creation. Rollback to ubuntu22-04

---
 .github/workflows/docs.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 4b95511d82f5..7fb6e0165ff0 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -12,7 +12,10 @@ on:
 jobs:
     docs:
       name: build and deploy docs
-      runs-on: ubuntu-latest
+      # not using latest due to issues with pip user packages, see
+      # https://github.com/actions/runner-images/issues/10781 and
+      # https://github.com/actions/runner-images/issues/10636
+      runs-on: ubuntu-22.04
 
       steps:      
         - name: Checkout code
@@ -23,9 +26,9 @@ jobs:
           run: export DEBIAN_FRONTEND=noninteractive
         - name: install dependencies
           run: |
-            pip install --break-system-packages sphinx
-            pip install --break-system-packages sphinx-rtd-theme
-            pip install --break-system-packages sphinx-multiversion
+            pip install sphinx
+            pip install sphinx-rtd-theme
+            pip install sphinx-multiversion
         - name: build docs
           run: |
             echo "Repo = ${GITHUB_REPOSITORY}"

From 343b0092e838425514e357685066305ab208b9d1 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Tue, 22 Oct 2024 12:02:45 +0200
Subject: [PATCH 08/18] Fix more view of views

---
 src/bvals/comms/bnd_info.cpp       |  3 ++-
 src/interface/mesh_data.hpp        |  3 ++-
 src/interface/sparse_pack_base.cpp |  6 ++++--
 src/interface/swarm_comms.cpp      |  1 +
 src/interface/swarm_pack_base.hpp  |  6 ++++--
 src/interface/variable_pack.hpp    | 23 ++++++++++++++---------
 src/parthenon_array_generic.hpp    |  4 ++++
 tst/unit/test_pararrays.cpp        |  6 ++++--
 8 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp
index 736992260913..54a6ae2b50fd 100644
--- a/src/bvals/comms/bnd_info.cpp
+++ b/src/bvals/comms/bnd_info.cpp
@@ -41,7 +41,8 @@ namespace parthenon {
 
 void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) {
   prores_info = ParArray1D<ProResInfo>("prores_info", n_regions);
-  prores_info_h = Kokkos::create_mirror_view(prores_info);
+  prores_info_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), prores_info);
   int nref_funcs = pkg->NumRefinementFuncs();
   // Note that assignment of Kokkos views resets them, but
   // buffer_subset_sizes is a std::vector. It must be cleared, then
diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp
index 14ae3959c32a..7d7d1cabcbe5 100644
--- a/src/interface/mesh_data.hpp
+++ b/src/interface/mesh_data.hpp
@@ -150,7 +150,8 @@ const MeshBlockPack<P> &PackOnMesh(M &map, BlockDataList_t<Real> &block_data_,
 
   if (make_new_pack) {
     ParArray1D<P> packs("MeshData::PackVariables::packs", nblocks);
-    auto packs_host = Kokkos::create_mirror_view(packs);
+    auto packs_host =
+        Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), packs);
 
     for (size_t i = 0; i < nblocks; i++) {
       const auto &pack = packing_function(block_data_[i], this_map, this_key);
diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index 2a7a5b70c41c..4ea5a558c3f5 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -152,7 +152,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
     leading_dim += 2;
   }
   pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size);
-  pack.pack_h_ = Kokkos::create_mirror_view(pack.pack_);
+  pack.pack_h_ = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_);
 
   // For non-flat packs, shape of pack is type x block x var x k x j x i
   // where type here might be a flux.
@@ -168,7 +169,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
   pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_);
 
   pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks);
-  auto coords_h = Kokkos::create_mirror_view(pack.coords_);
+  auto coords_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.coords_);
 
   // Fill the views
   int idx = 0;
diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp
index 3ee4e2af7512..bb5755e2cbe8 100644
--- a/src/interface/swarm_comms.cpp
+++ b/src/interface/swarm_comms.cpp
@@ -157,6 +157,7 @@ void Swarm::SetupPersistentMPI() {
 }
 
 void Swarm::CountParticlesToSend_() {
+  // TODO(PG->BRR) What's going on here? Why is the mask being copied?
   auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
   auto swarm_d = GetDeviceContext();
   auto pmb = GetBlockPointer();
diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp
index 0733aa51f329..52a2c3c47fc7 100644
--- a/src/interface/swarm_pack_base.hpp
+++ b/src/interface/swarm_pack_base.hpp
@@ -109,7 +109,8 @@ class SwarmPackBase {
     // Allocate the views
     int leading_dim = 1;
     pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size);
-    auto pack_h = Kokkos::create_mirror_view(pack.pack_);
+    auto pack_h = Kokkos::create_mirror_view(
+        Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_);
 
     pack.bounds_ = bounds_t("bounds", 2, nblocks, nvar);
     auto bounds_h = Kokkos::create_mirror_view(pack.bounds_);
@@ -154,7 +155,8 @@ class SwarmPackBase {
     Kokkos::deep_copy(pack.bounds_, bounds_h);
 
     pack.contexts_ = contexts_t("contexts", nblocks);
-    pack.contexts_h_ = Kokkos::create_mirror_view(pack.contexts_);
+    pack.contexts_h_ = Kokkos::create_mirror_view(
+        Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.contexts_);
     pack.max_active_indices_ = max_active_indices_t("max_active_indices", nblocks);
     pack.flat_index_map_ = max_active_indices_t("flat_index_map", nblocks + 1);
     BuildSupplemental(pmd, desc, pack);
diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp
index 037731093ce1..07e56091ddfe 100644
--- a/src/interface/variable_pack.hpp
+++ b/src/interface/variable_pack.hpp
@@ -570,10 +570,11 @@ void FillVarView(const VariableVector<T> &vars, int vsize, bool coarse,
   assert(vsize == sparse_id_out.size());
   assert(vsize == vector_component_out.size());
 
-  auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out);
-  auto host_sp = Kokkos::create_mirror_view(Kokkos::HostSpace(), sparse_id_out);
-  auto host_vc = Kokkos::create_mirror_view(Kokkos::HostSpace(), vector_component_out);
-  auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), allocated_out);
+  auto host_cv =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out);
+  auto host_sp = Kokkos::create_mirror_view(sparse_id_out);
+  auto host_vc = Kokkos::create_mirror_view(vector_component_out);
+  auto host_al = Kokkos::create_mirror_view(allocated_out);
 
   int vindex = 0;
   for (const auto &v : vars) {
@@ -634,7 +635,8 @@ void FillSwarmVarView(const vpack_types::SwarmVarList<T> &vars,
                       ViewOfParArrays1D<T> &cv_out, PackIndexMap *pvmap) {
   using vpack_types::IndexPair;
 
-  auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out);
+  auto host_cv =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out);
 
   int vindex = 0;
   for (const auto v : vars) {
@@ -675,10 +677,13 @@ void FillFluxViews(const VariableVector<T> &vars, const int ndim,
                    PackIndexMap *pvmap) {
   using vpack_types::IndexPair;
 
-  auto host_f1 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f1_out);
-  auto host_f2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f2_out);
-  auto host_f3 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f3_out);
-  auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), flux_allocated_out);
+  auto host_f1 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f1_out);
+  auto host_f2 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f2_out);
+  auto host_f3 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f3_out);
+  auto host_al = Kokkos::create_mirror_view(flux_allocated_out);
 
   int vindex = 0;
   for (const auto &v : vars) {
diff --git a/src/parthenon_array_generic.hpp b/src/parthenon_array_generic.hpp
index d527707f9070..ac38b6cb5e5f 100644
--- a/src/parthenon_array_generic.hpp
+++ b/src/parthenon_array_generic.hpp
@@ -221,6 +221,8 @@ class ParArrayGeneric : public State {
     // return GetDim(1) * GetDim(2) * GetDim(3) * GetDim(4) * GetDim(5) * GetDim(6);
   }
 
+  // TODO(PG?) Can we use concepts here to add a
+  // Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView?
   template <typename MemSpace>
   auto GetMirror(MemSpace const &memspace) {
     auto mirror = Kokkos::create_mirror_view(memspace, data_);
@@ -333,6 +335,8 @@ inline auto subview(std::index_sequence<I...>,
   return parthenon::ParArrayGeneric<decltype(v), SU>(v, arr);
 }
 
+// TODO(PG?) Can we use concepts here to add a
+// Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView?
 template <class Space, class U, class SU>
 inline auto create_mirror_view_and_copy(Space const &space,
                                         const parthenon::ParArrayGeneric<U, SU> &arr) {
diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp
index e79927c13b20..d281ce9e0c8d 100644
--- a/tst/unit/test_pararrays.cpp
+++ b/tst/unit/test_pararrays.cpp
@@ -452,7 +452,8 @@ TEST_CASE("ParArray state", "[ParArrayND]") {
 
   GIVEN("An array of ParArrays filled with the values contained in their state") {
     parthenon::ParArray1D<arr3d_t> pack("test pack", NS);
-    auto pack_h = Kokkos::create_mirror_view(pack);
+    auto pack_h =
+        Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack);
 
     for (int b = 0; b < NS; ++b) {
       state_t state(static_cast<double>(b));
@@ -544,7 +545,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") {
     new (&views[n])
         view_3d_t(Kokkos::view_alloc(label, Kokkos::WithoutInitializing), N, N, N);
     auto a_h = arrays(n).GetHostMirror();
-    auto v_h = Kokkos::create_mirror_view(views(n));
+    auto v_h = Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit),
+                                          views(n));
     for (int k = 0; k < N; k++) {
       for (int j = 0; j < N; j++) {
         for (int i = 0; i < N; i++) {

From 4999a80408910c191c7c0e7fdee5aaca73afe50d Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Tue, 22 Oct 2024 19:40:46 +0200
Subject: [PATCH 09/18] Chasing more ViewOfView on host

---
 src/interface/variable_pack.hpp | 19 +++++++++++--------
 src/kokkos_abstraction.hpp      | 13 +++++++++++++
 tst/unit/test_pararrays.cpp     |  2 +-
 3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp
index 07e56091ddfe..76fce5f6b861 100644
--- a/src/interface/variable_pack.hpp
+++ b/src/interface/variable_pack.hpp
@@ -244,10 +244,11 @@ class PackIndexMap {
 };
 
 template <typename T>
-using ViewOfParArrays = ParArray1D<ParArray3D<T, VariableState>>;
+using ViewOfParArrays =
+    Kokkos::View<ParArray3D<T, VariableState> *, LayoutWrapper, DevMemSpace>;
 
 template <typename T>
-using ViewOfParArrays1D = ParArray1D<ParArray1D<T>>;
+using ViewOfParArrays1D = Kokkos::View<ParArray1D<T> *, LayoutWrapper, DevMemSpace>;
 
 // forward declaration
 template <typename T>
@@ -760,10 +761,11 @@ VariableFluxPack<T> MakeFluxPack(const VarListWithKeys<T> &var_list,
   }
 
   // make the outer view
-  ViewOfParArrays<T> cv("MakeFluxPack::cv", vsize * (extra_components ? 3 : 1));
-  ViewOfParArrays<T> f1("MakeFluxPack::f1", fsize);
-  ViewOfParArrays<T> f2("MakeFluxPack::f2", fsize);
-  ViewOfParArrays<T> f3("MakeFluxPack::f3", fsize);
+  ViewOfParArrays<T> cv(ViewOfViewAlloc("MakeFluxPack::cv"),
+                        vsize * (extra_components ? 3 : 1));
+  ViewOfParArrays<T> f1(ViewOfViewAlloc("MakeFluxPack::f1"), fsize);
+  ViewOfParArrays<T> f2(ViewOfViewAlloc("MakeFluxPack::f2"), fsize);
+  ViewOfParArrays<T> f3(ViewOfViewAlloc("MakeFluxPack::f3"), fsize);
   ParArray1D<bool> flux_allocated("MakePack::allocated", fsize);
   ParArray1D<int> sparse_id("MakeFluxPack::sparse_id", vsize);
   ParArray1D<int> vector_component("MakeFluxPack::vector_component", vsize);
@@ -814,7 +816,8 @@ VariablePack<T> MakePack(const VarListWithKeys<T> &var_list, bool coarse,
   }
 
   // make the outer view
-  ViewOfParArrays<T> cv("MakePack::cv", vsize * (extra_components ? 3 : 1));
+  ViewOfParArrays<T> cv(ViewOfViewAlloc("MakePack::cv"),
+                        vsize * (extra_components ? 3 : 1));
   ParArray1D<int> sparse_id("MakePack::sparse_id", vsize);
   ParArray1D<int> vector_component("MakePack::vector_component", vsize);
   ParArray1D<bool> allocated("MakePack::allocated", vsize);
@@ -847,7 +850,7 @@ SwarmVariablePack<T> MakeSwarmPack(const vpack_types::SwarmVarList<T> &vars,
   }
 
   // make the outer view
-  ViewOfParArrays1D<T> cv("MakePack::cv", vsize);
+  ViewOfParArrays1D<T> cv(ViewOfViewAlloc("MakePack::cv"), vsize);
 
   std::array<int, 2> cv_size{0, 0};
   if (vsize > 0) {
diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
index 8fa89f82e95e..b5dfc3fa75a0 100644
--- a/src/kokkos_abstraction.hpp
+++ b/src/kokkos_abstraction.hpp
@@ -1035,6 +1035,19 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons
       reduction);
 }
 
+// For ViewOfView we need to call the destructor of the inner views on
+// the host and not on the device (which would happen by default).
+// Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is
+// on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be
+// passed when calling `create_mirror_view`.
+auto ViewOfViewAlloc(const std::string &label) {
+  if constexpr (std::is_same_v<DevMemSpace, HostMemSpace>) {
+    return Kokkos::view_alloc(Kokkos::SequentialHostInit, label);
+  } else {
+    return Kokkos::view_alloc(label);
+  }
+}
+
 // reused from kokoks/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
 // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7
 template <class ExecSpace>
diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp
index d281ce9e0c8d..9e1816b0e669 100644
--- a/tst/unit/test_pararrays.cpp
+++ b/tst/unit/test_pararrays.cpp
@@ -451,7 +451,7 @@ TEST_CASE("ParArray state", "[ParArrayND]") {
   }
 
   GIVEN("An array of ParArrays filled with the values contained in their state") {
-    parthenon::ParArray1D<arr3d_t> pack("test pack", NS);
+    Kokkos::View<arr3d_t *> pack(parthenon::ViewOfViewAlloc("test pack"), NS);
     auto pack_h =
         Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack);
 

From 05105495fdc39457ccadcd3e1b1d0c9e934edfe4 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Wed, 23 Oct 2024 16:53:03 -0400
Subject: [PATCH 10/18] output and crash if dt is tiny

---
 src/driver/driver.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp
index 17b913e3a2ac..0443bdb6a756 100644
--- a/src/driver/driver.cpp
+++ b/src/driver/driver.cpp
@@ -133,6 +133,12 @@ DriverStatus EvolutionDriver::Execute() {
       // check for signals
       signal = SignalHandler::CheckSignalFlags();
 
+      // TODO(bwibking): check for application debug callback
+      // currently hard-coded to check for tiny dt
+      if (tm.dt < 1e-10 * tm.time) {
+	signal = OutputSignal::final;
+      }
+      
       if (signal == OutputSignal::final) {
         break;
       }

From c799d026bd226e3c175e04ff435e5c39744361ee Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Wed, 6 Nov 2024 16:32:37 +0100
Subject: [PATCH 11/18] Fix ViewAlloc func and add doc

---
 README.md                      |  2 +-
 doc/sphinx/src/development.rst | 28 ++++++++++++++++++++++++++++
 src/kokkos_abstraction.hpp     |  3 ++-
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ed6a1bb05a16..f049c44ecd7d 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr
 
 * CMake 3.16 or greater
 * C++17 compatible compiler
-* Kokkos 4.0.1 or greater
+* Kokkos 4.1.1 or greater
 
 ## Optional (enabling features)
 
diff --git a/doc/sphinx/src/development.rst b/doc/sphinx/src/development.rst
index dbab91d8d5ce..98ac9cef90a8 100644
--- a/doc/sphinx/src/development.rst
+++ b/doc/sphinx/src/development.rst
@@ -62,6 +62,34 @@ parallelism interface that is needed for managing memory cached in
 tightly nested loops. The wrappers are documented
 :ref:`here <nested par for>`.
 
+View of Views
+-------------
+
+Special care needs to be taken when working with a ``View`` of ``View``.
+
+To repeat the Kokkos documenation: `Don't use them <https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/View.html#can-i-make-a-view-of-views>`__
+
+But if you have to (which is the case in some places inside Parthenon)
+then follow this pattern:
+
+.. code:: c++
+
+   Kokkos::View<ParArray1D<Real> *> view_of_pararrays(parthenon::ViewOfViewAlloc("myname"), 10);
+
+The ``ViewOfViewAlloc`` ensures that the ``Kokkos::SequentialHostInit`` property is added,
+which results in the (inner ``View`` ) deallocators being called on the host (rather than on
+the device by default).
+
+Similarly, when you create a host mirror of said ``View`` of ``View`` add the additional
+property for the same reason.
+
+.. code:: c++
+
+   auto view_of_pararrays_h =
+        Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view_of_pararrays);
+
+Note that the ``SequentialHostInit`` was only added in Kokkos 4.4.1 (which is now the default in Parthenon).
+
 The need for reductions within function handling ``MeshBlock`` data
 -------------------------------------------------------------------
 
diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
index b5dfc3fa75a0..37262dd0356c 100644
--- a/src/kokkos_abstraction.hpp
+++ b/src/kokkos_abstraction.hpp
@@ -1040,8 +1040,9 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons
 // Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is
 // on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be
 // passed when calling `create_mirror_view`.
+template <typename T = DevMemSpace>
 auto ViewOfViewAlloc(const std::string &label) {
-  if constexpr (std::is_same_v<DevMemSpace, HostMemSpace>) {
+  if constexpr (std::is_same_v<T, HostMemSpace>) {
     return Kokkos::view_alloc(Kokkos::SequentialHostInit, label);
   } else {
     return Kokkos::view_alloc(label);

From ebbcc1704ed2c68c99808cf099eef41396b2d5ce Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 13:41:44 +0100
Subject: [PATCH 12/18] Disable Ascent in testing

---
 .github/workflows/ci-extended.yml | 48 +++++++++++++++----------------
 .github/workflows/ci-short.yml    | 12 ++++----
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
index 8ca646cfc2eb..0c083091cf0e 100644
--- a/.github/workflows/ci-extended.yml
+++ b/.github/workflows/ci-extended.yml
@@ -32,9 +32,9 @@ jobs:
         parallel: ['serial', 'mpi']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -91,27 +91,27 @@ jobs:
           ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
 
       # Test Ascent integration (only most complex setup with MPI and on device)
-      - name: Ascent tests
-        if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
-        run: |
-          cmake -B build-ascent \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
-            -DPARTHENON_ENABLE_ASCENT=ON \
-            -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
-          cmake --build build-ascent
-          cd example/advection/
-          # Pick GPU with most available memory
-          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
-          mpirun -np 2 ../../build-ascent/example/advection/advection-example \
-            -i parthinput.advection \
-            parthenon/output5/dt=0.05 \
-            parthenon/time/tlim=0.1
-          # check if file exists
-          if [ ! -f "ascent_render_57.png" ]; then
-            echo "'ascent_render_57.png' does not exist."
-            exit 1
-          fi
+      # - name: Ascent tests
+      #   if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
+      #   run: |
+      #     cmake -B build-ascent \
+      #       -DCMAKE_BUILD_TYPE=Release \
+      #       -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
+      #       -DPARTHENON_ENABLE_ASCENT=ON \
+      #       -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
+      #     cmake --build build-ascent
+      #     cd example/advection/
+      #     # Pick GPU with most available memory
+      #     export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
+      #     mpirun -np 2 ../../build-ascent/example/advection/advection-example \
+      #       -i parthinput.advection \
+      #       parthenon/output5/dt=0.05 \
+      #       parthenon/time/tlim=0.1
+      #     # check if file exists
+      #     if [ ! -f "ascent_render_57.png" ]; then
+      #       echo "'ascent_render_57.png' does not exist."
+      #       exit 1
+      #     fi
 
       - uses: actions/upload-artifact@v3
         with:
@@ -120,7 +120,7 @@ jobs:
             build/CMakeFiles/CMakeOutput.log
             build/tst/regression/outputs/advection_convergence*/advection-errors.dat
             build/tst/regression/outputs/advection_convergence*/advection-errors.png
-            example/advection/ascent_render_57.png
+          # example/advection/ascent_render_57.png
           retention-days: 3
 
   perf-and-regression-amdgpu:
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
index ecb4052411ee..ab42a7450c83 100644
--- a/.github/workflows/ci-short.yml
+++ b/.github/workflows/ci-short.yml
@@ -20,9 +20,9 @@ jobs:
   style:
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -45,9 +45,9 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -77,9 +77,9 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:

From 11e1e8614d07c0d85c0a04d914fde33e6115dbac Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 14:27:39 +0100
Subject: [PATCH 13/18] Revert "Disable Ascent in testing"

This reverts commit ebbcc1704ed2c68c99808cf099eef41396b2d5ce.
---
 .github/workflows/ci-extended.yml | 48 +++++++++++++++----------------
 .github/workflows/ci-short.yml    | 12 ++++----
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
index 0c083091cf0e..8ca646cfc2eb 100644
--- a/.github/workflows/ci-extended.yml
+++ b/.github/workflows/ci-extended.yml
@@ -32,9 +32,9 @@ jobs:
         parallel: ['serial', 'mpi']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
+      options: --user 1001
     steps:
       - uses: actions/checkout@v3
         with:
@@ -91,27 +91,27 @@ jobs:
           ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
 
       # Test Ascent integration (only most complex setup with MPI and on device)
-      # - name: Ascent tests
-      #   if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
-      #   run: |
-      #     cmake -B build-ascent \
-      #       -DCMAKE_BUILD_TYPE=Release \
-      #       -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
-      #       -DPARTHENON_ENABLE_ASCENT=ON \
-      #       -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
-      #     cmake --build build-ascent
-      #     cd example/advection/
-      #     # Pick GPU with most available memory
-      #     export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
-      #     mpirun -np 2 ../../build-ascent/example/advection/advection-example \
-      #       -i parthinput.advection \
-      #       parthenon/output5/dt=0.05 \
-      #       parthenon/time/tlim=0.1
-      #     # check if file exists
-      #     if [ ! -f "ascent_render_57.png" ]; then
-      #       echo "'ascent_render_57.png' does not exist."
-      #       exit 1
-      #     fi
+      - name: Ascent tests
+        if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
+        run: |
+          cmake -B build-ascent \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
+            -DPARTHENON_ENABLE_ASCENT=ON \
+            -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
+          cmake --build build-ascent
+          cd example/advection/
+          # Pick GPU with most available memory
+          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
+          mpirun -np 2 ../../build-ascent/example/advection/advection-example \
+            -i parthinput.advection \
+            parthenon/output5/dt=0.05 \
+            parthenon/time/tlim=0.1
+          # check if file exists
+          if [ ! -f "ascent_render_57.png" ]; then
+            echo "'ascent_render_57.png' does not exist."
+            exit 1
+          fi
 
       - uses: actions/upload-artifact@v3
         with:
@@ -120,7 +120,7 @@ jobs:
             build/CMakeFiles/CMakeOutput.log
             build/tst/regression/outputs/advection_convergence*/advection-errors.dat
             build/tst/regression/outputs/advection_convergence*/advection-errors.png
-          # example/advection/ascent_render_57.png
+            example/advection/ascent_render_57.png
           retention-days: 3
 
   perf-and-regression-amdgpu:
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
index ab42a7450c83..ecb4052411ee 100644
--- a/.github/workflows/ci-short.yml
+++ b/.github/workflows/ci-short.yml
@@ -20,9 +20,9 @@ jobs:
   style:
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
+      options: --user 1001
     steps:
       - uses: actions/checkout@v3
         with:
@@ -45,9 +45,9 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
+      options: --user 1001
     steps:
       - uses: actions/checkout@v3
         with:
@@ -77,9 +77,9 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
+      options: --user 1001
     steps:
       - uses: actions/checkout@v3
         with:

From e632f6b7cbab5535f8db91eb67cb81fa49146513 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 14:31:04 +0100
Subject: [PATCH 14/18] Disable CUDA IPC

---
 .github/workflows/ci-extended.yml | 4 +++-
 .github/workflows/ci-short.yml    | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
index 8ca646cfc2eb..1dd39c782477 100644
--- a/.github/workflows/ci-extended.yml
+++ b/.github/workflows/ci-extended.yml
@@ -21,6 +21,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # CUDA IPC within docker repeated seem to cause issue on the CI machine
+  OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
   # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
   OMPI_MCA_btl_vader_single_copy_mechanism: none
 
@@ -34,7 +36,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
index ecb4052411ee..7e0fd8bf759a 100644
--- a/.github/workflows/ci-short.yml
+++ b/.github/workflows/ci-short.yml
@@ -13,6 +13,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # CUDA IPC within docker repeated seem to cause issue on the CI machine
+  OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
   # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
   OMPI_MCA_btl_vader_single_copy_mechanism: none
 
@@ -22,7 +24,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -47,7 +49,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -79,7 +81,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:

From 120ceef6bd4e34cc9fb10009cb654572a3a00dd4 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 17:08:48 +0100
Subject: [PATCH 15/18] Fix more view of views

---
 README.md                          |  2 +-
 src/bvals/bvals.hpp                |  3 ---
 src/bvals/comms/bnd_info.cpp       |  2 +-
 src/bvals/comms/bnd_info.hpp       |  5 +++--
 src/bvals/comms/bvals_utils.hpp    |  3 ++-
 src/interface/mesh_data.hpp        |  4 +++-
 src/interface/sparse_pack_base.cpp |  5 +++--
 src/interface/sparse_pack_base.hpp |  4 +++-
 src/interface/swarm_pack_base.hpp  | 11 ++++++-----
 tst/unit/test_pararrays.cpp        |  2 ++
 10 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index f049c44ecd7d..b874e5172889 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr
 
 * CMake 3.16 or greater
 * C++17 compatible compiler
-* Kokkos 4.1.1 or greater
+* Kokkos 4.4.1 or greater
 
 ## Optional (enabling features)
 
diff --git a/src/bvals/bvals.hpp b/src/bvals/bvals.hpp
index bc6ed0ebed56..aaa4da22c0f0 100644
--- a/src/bvals/bvals.hpp
+++ b/src/bvals/bvals.hpp
@@ -101,9 +101,6 @@ class BoundarySwarm : public BoundaryCommunication {
   explicit BoundarySwarm(std::weak_ptr<MeshBlock> pmb, const std::string &label);
   ~BoundarySwarm() = default;
 
-  std::vector<ParArrayND<int>> vars_int;
-  std::vector<ParArrayND<Real>> vars_real;
-
   // (usuallly the std::size_t unsigned integer type)
   std::vector<BoundaryCommunication *>::size_type bswarm_index;
 
diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp
index 54a6ae2b50fd..198489c1c1f0 100644
--- a/src/bvals/comms/bnd_info.cpp
+++ b/src/bvals/comms/bnd_info.cpp
@@ -40,7 +40,7 @@
 namespace parthenon {
 
 void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) {
-  prores_info = ParArray1D<ProResInfo>("prores_info", n_regions);
+  prores_info = ProResInfoArr_t(ViewOfViewAlloc("prores_info"), n_regions);
   prores_info_h = Kokkos::create_mirror_view(
       Kokkos::view_alloc(Kokkos::SequentialHostInit), prores_info);
   int nref_funcs = pkg->NumRefinementFuncs();
diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp
index 8800f6fd867f..8c6c85d59f93 100644
--- a/src/bvals/comms/bnd_info.hpp
+++ b/src/bvals/comms/bnd_info.hpp
@@ -26,6 +26,7 @@
 #include "bvals/neighbor_block.hpp"
 #include "coordinates/coordinates.hpp"
 #include "interface/variable_state.hpp"
+#include "kokkos_abstraction.hpp"
 #include "mesh/domain.hpp"
 #include "mesh/forest/logical_coordinate_transformation.hpp"
 #include "utils/communication_buffer.hpp"
@@ -130,8 +131,8 @@ int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb,
 using BndInfoArr_t = Kokkos::View<BndInfo *, LayoutWrapper, DevMemSpace>;
 using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror;
 
-using ProResInfoArr_t = ParArray1D<ProResInfo>;
-using ProResInfoArrHost_t = typename ParArray1D<ProResInfo>::HostMirror;
+using ProResInfoArr_t = Kokkos::View<ProResInfo *, LayoutWrapper, DevMemSpace>;
+using ProResInfoArrHost_t = typename ProResInfoArr_t::HostMirror;
 class StateDescriptor;
 struct ProResCache_t {
   ProResInfoArr_t prores_info{};
diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp
index 8a47c716aa22..87b55b51019d 100644
--- a/src/bvals/comms/bvals_utils.hpp
+++ b/src/bvals/comms/bvals_utils.hpp
@@ -28,6 +28,7 @@
 #include "bvals/comms/bnd_info.hpp"
 #include "bvals/comms/bvals_in_one.hpp"
 #include "interface/variable.hpp"
+#include "kokkos_abstraction.hpp"
 #include "mesh/domain.hpp"
 #include "mesh/mesh.hpp"
 #include "mesh/meshblock.hpp"
@@ -215,7 +216,7 @@ inline void RebuildBufferCache(std::shared_ptr<MeshData<Real>> md, int nbound,
   using namespace loops;
   using namespace loops::shorthands;
   BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER);
-  cache.bnd_info = BndInfoArr_t("bnd_info", nbound);
+  cache.bnd_info = BndInfoArr_t(ViewOfViewAlloc("bnd_info"), nbound);
   cache.bnd_info_h = Kokkos::create_mirror_view(
       Kokkos::view_alloc(Kokkos::SequentialHostInit), cache.bnd_info);
 
diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp
index 7d7d1cabcbe5..9a86f0e52014 100644
--- a/src/interface/mesh_data.hpp
+++ b/src/interface/mesh_data.hpp
@@ -26,6 +26,7 @@
 #include "interface/sparse_pack_base.hpp"
 #include "interface/swarm_pack_base.hpp"
 #include "interface/variable_pack.hpp"
+#include "kokkos_abstraction.hpp"
 #include "mesh/domain.hpp"
 #include "mesh/meshblock.hpp"
 #include "mesh/meshblock_pack.hpp"
@@ -149,7 +150,8 @@ const MeshBlockPack<P> &PackOnMesh(M &map, BlockDataList_t<Real> &block_data_,
   }
 
   if (make_new_pack) {
-    ParArray1D<P> packs("MeshData::PackVariables::packs", nblocks);
+    Kokkos::View<P *, LayoutWrapper, DevMemSpace> packs(
+        ViewOfViewAlloc("MeshData::PackVariables::packs"), nblocks);
     auto packs_host =
         Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), packs);
 
diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index 4ea5a558c3f5..751e88c200c6 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -30,6 +30,7 @@
 #include "interface/sparse_pack_base.hpp"
 #include "interface/state_descriptor.hpp"
 #include "interface/variable.hpp"
+#include "kokkos_abstraction.hpp"
 #include "utils/utils.hpp"
 namespace parthenon {
 namespace impl {
@@ -151,7 +152,7 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
   } else if (contains_face_or_edge) {
     leading_dim += 2;
   }
-  pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size);
+  pack.pack_ = pack_t(ViewOfViewAlloc("data_ptr"), leading_dim, pack.nblocks_, max_size);
   pack.pack_h_ = Kokkos::create_mirror_view(
       Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_);
 
@@ -168,7 +169,7 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
   pack.block_props_ = block_props_t("block_props", nblocks, 27 + 1);
   pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_);
 
-  pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks);
+  pack.coords_ = coords_t(ViewOfViewAlloc("coords"), desc.flat ? max_size : nblocks);
   auto coords_h = Kokkos::create_mirror_view(
       Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.coords_);
 
diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index 0deca487a20a..25331c384b56 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -30,6 +30,7 @@
 #include "interface/state_descriptor.hpp"
 #include "interface/variable.hpp"
 #include "interface/variable_state.hpp"
+#include "kokkos_abstraction.hpp"
 #include "utils/utils.hpp"
 
 namespace parthenon {
@@ -55,7 +56,8 @@ class SparsePackBase {
 
   using alloc_t = std::vector<int>;
   using include_t = std::vector<bool>;
-  using pack_t = ParArray3D<ParArray3D<Real, VariableState>>;
+  using pack_t =
+      Kokkos::View<ParArray3D<Real, VariableState> ***, LayoutWrapper, DevMemSpace>;
   using pack_h_t = typename pack_t::HostMirror;
   using bounds_t = ParArray3D<int>;
   using bounds_h_t = typename bounds_t::HostMirror;
diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp
index 52a2c3c47fc7..6d0933b8485c 100644
--- a/src/interface/swarm_pack_base.hpp
+++ b/src/interface/swarm_pack_base.hpp
@@ -28,6 +28,7 @@
 #include "interface/state_descriptor.hpp"
 #include "interface/swarm_device_context.hpp"
 #include "interface/variable.hpp"
+#include "kokkos_abstraction.hpp"
 #include "utils/utils.hpp"
 
 namespace parthenon {
@@ -43,10 +44,10 @@ class SwarmPackBase {
   SwarmPackBase() = default;
   virtual ~SwarmPackBase() = default;
 
-  using pack_t = ParArray3D<ParArray1D<TYPE>>;
+  using pack_t = Kokkos::View<ParArray1D<TYPE> ***, LayoutWrapper, DevMemSpace>;
   using bounds_t = ParArray3D<int>;
-  using contexts_t = ParArray1D<SwarmDeviceContext>;
-  using contexts_h_t = typename ParArray1D<SwarmDeviceContext>::HostMirror;
+  using contexts_t = Kokkos::View<SwarmDeviceContext *, LayoutWrapper, DevMemSpace>;
+  using contexts_h_t = typename contexts_t::HostMirror;
   using max_active_indices_t = ParArray1D<int>;
   using desc_t = impl::SwarmPackDescriptor<TYPE>;
   using idx_map_t = std::unordered_map<std::string, std::size_t>;
@@ -108,7 +109,7 @@ class SwarmPackBase {
 
     // Allocate the views
     int leading_dim = 1;
-    pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size);
+    pack.pack_ = pack_t(ViewOfViewAlloc("data_ptr"), leading_dim, nblocks, max_size);
     auto pack_h = Kokkos::create_mirror_view(
         Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_);
 
@@ -154,7 +155,7 @@ class SwarmPackBase {
     Kokkos::deep_copy(pack.pack_, pack_h);
     Kokkos::deep_copy(pack.bounds_, bounds_h);
 
-    pack.contexts_ = contexts_t("contexts", nblocks);
+    pack.contexts_ = contexts_t(ViewOfViewAlloc("contexts"), nblocks);
     pack.contexts_h_ = Kokkos::create_mirror_view(
         Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.contexts_);
     pack.max_active_indices_ = max_active_indices_t("max_active_indices", nblocks);
diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp
index 9e1816b0e669..865dfbedf5e9 100644
--- a/tst/unit/test_pararrays.cpp
+++ b/tst/unit/test_pararrays.cpp
@@ -528,6 +528,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") {
 
   // view of views. See:
   // https://github.com/kokkos/kokkos/wiki/View#6232-whats-the-problem-with-a-view-of-views
+  // TODO(PG) depending on the results of the view of view discussion, we should add
+  // destructor or ViewOfViewAlloc with SequentialHostInit
   using view_3d_t =
       Kokkos::View<Real ***, parthenon::LayoutWrapper, parthenon::DevMemSpace>;
   using arrays_t = Kokkos::View<parthenon::ParArray6D<Real> *, UVMSpace>;

From eb44166831f88833cad96b6a0b8d5fbf5f540c9d Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 17:15:41 +0100
Subject: [PATCH 16/18] Fix missig update of private var

---
 src/interface/sparse_pack_base.hpp | 2 +-
 src/mesh/meshblock_pack.hpp        | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index 25331c384b56..089ced81f290 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -63,7 +63,7 @@ class SparsePackBase {
   using bounds_h_t = typename bounds_t::HostMirror;
   using block_props_t = ParArray2D<int>;
   using block_props_h_t = typename block_props_t::HostMirror;
-  using coords_t = ParArray1D<ParArray0D<Coordinates_t>>;
+  using coords_t = Kokkos::View<ParArray0D<Coordinates_t> *, LayoutWrapper, DevMemSpace>;
 
   // Returns a SparsePackBase object that is either newly created or taken
   // from the cache in pmd. The cache itself handles the all of this logic
diff --git a/src/mesh/meshblock_pack.hpp b/src/mesh/meshblock_pack.hpp
index 5669e112109b..2624da53824c 100644
--- a/src/mesh/meshblock_pack.hpp
+++ b/src/mesh/meshblock_pack.hpp
@@ -21,6 +21,7 @@
 #include <utility>
 #include <vector>
 
+#include "Kokkos_Core_fwd.hpp"
 #include "coordinates/coordinates.hpp"
 #include "interface/variable_pack.hpp"
 #include "kokkos_abstraction.hpp"
@@ -38,7 +39,8 @@ class MeshBlockPack {
   using pack_type = T;
 
   MeshBlockPack() = default;
-  MeshBlockPack(const ParArray1D<T> view, const std::array<int, 5> dims)
+  MeshBlockPack(const Kokkos::View<T *, LayoutWrapper, DevMemSpace> view,
+                const std::array<int, 5> dims)
       : v_(view), dims_(dims), ndim_((dims[2] > 1 ? 3 : (dims[1] > 1 ? 2 : 1))) {}
 
   KOKKOS_FORCEINLINE_FUNCTION
@@ -85,7 +87,7 @@ class MeshBlockPack {
   const Coordinates_t &GetCoords(const int i) const { return v_(i).GetCoords(); }
 
  private:
-  ParArray1D<T> v_;
+  Kokkos::View<T *, LayoutWrapper, DevMemSpace> v_;
   std::array<int, 5> dims_;
   int ndim_;
 };

From eb03328e965e5b1c53843a0e60485860122b8311 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 18:12:12 +0100
Subject: [PATCH 17/18] Make linter happy

---
 src/mesh/meshblock_pack.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mesh/meshblock_pack.hpp b/src/mesh/meshblock_pack.hpp
index 2624da53824c..1800489ccf0b 100644
--- a/src/mesh/meshblock_pack.hpp
+++ b/src/mesh/meshblock_pack.hpp
@@ -21,7 +21,6 @@
 #include <utility>
 #include <vector>
 
-#include "Kokkos_Core_fwd.hpp"
 #include "coordinates/coordinates.hpp"
 #include "interface/variable_pack.hpp"
 #include "kokkos_abstraction.hpp"

From c6f8799c3b9747c677ffcdfe983ee6c930892ee8 Mon Sep 17 00:00:00 2001
From: Ben Wibking <ben@wibking.com>
Date: Tue, 12 Nov 2024 12:36:27 -0500
Subject: [PATCH 18/18] return DriverStatus::failed

---
 src/driver/driver.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp
index 0443bdb6a756..9d8ce66ad702 100644
--- a/src/driver/driver.cpp
+++ b/src/driver/driver.cpp
@@ -94,6 +94,8 @@ DriverStatus EvolutionDriver::Execute() {
   // Defaults must be set across all ranks
   DumpInputParameters();
 
+  DriverStatus driver_status = DriverStatus::complete;
+
   { // Main t < tmax loop region
     PARTHENON_INSTRUMENT
     while (tm.KeepGoing() && signal != OutputSignal::analysis) {
@@ -135,8 +137,10 @@ DriverStatus EvolutionDriver::Execute() {
 
       // TODO(bwibking): check for application debug callback
       // currently hard-coded to check for tiny dt
-      if (tm.dt < 1e-10 * tm.time) {
-	signal = OutputSignal::final;
+      if (tm.dt < 1e-6 * tm.time) {
+	      signal = OutputSignal::final;
+        driver_status = DriverStatus::failed;
+        // do not return here, since we still want to write an output
       }
       
       if (signal == OutputSignal::final) {
@@ -161,14 +165,16 @@ DriverStatus EvolutionDriver::Execute() {
     pmesh->UserWorkAfterLoop(pmesh, pinput, tm);
   }
 
-  DriverStatus status = tm.KeepGoing() ? DriverStatus::timeout : DriverStatus::complete;
+  if (driver_status != DriverStatus::failed) {
+    driver_status = tm.KeepGoing() ? DriverStatus::timeout : DriverStatus::complete;
+  }
   // Do *not* write the "final" output, if this is analysis run.
   // The analysis output itself has already been written above before the main loop.
   if (signal != OutputSignal::analysis) {
     pouts->MakeOutputs(pmesh, pinput, &tm, OutputSignal::final);
   }
-  PostExecute(status);
-  return status;
+  PostExecute(driver_status);
+  return driver_status;
 }
 
 void EvolutionDriver::PostExecute(DriverStatus status) {