From daabe3234a8230e00f860795cb815417e82647b4 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 30 Aug 2023 13:12:01 -0600
Subject: [PATCH 01/67] start adding in the pieces

---
 src/CMakeLists.txt               |  1 +
 src/interface/sparse_pack.hpp    |  5 +++
 src/interface/variable.cpp       |  6 ++--
 src/interface/variable.hpp       |  2 +-
 src/interface/variable_state.cpp |  4 ++-
 src/interface/variable_state.hpp |  7 +++--
 src/kokkos_abstraction.hpp       |  5 +++
 src/mesh/amr_loadbalance.cpp     |  4 +++
 src/mesh/mesh.cpp                |  2 ++
 src/mesh/mesh.hpp                |  1 +
 src/utils/block_timer.hpp        | 52 ++++++++++++++++++++++++++++++++
 11 files changed, 81 insertions(+), 8 deletions(-)
 create mode 100644 src/utils/block_timer.hpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 54a226c55383..caffe520652d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -221,6 +221,7 @@ add_library(parthenon
   utils/alias_method.cpp
   utils/alias_method.hpp
   utils/array_to_tuple.hpp
+  utils/block_timer.hpp
   utils/buffer_utils.cpp
   utils/buffer_utils.hpp
   utils/change_rundir.cpp
diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp
index 1cafee22e3a6..abb0b0397c3a 100644
--- a/src/interface/sparse_pack.hpp
+++ b/src/interface/sparse_pack.hpp
@@ -156,6 +156,11 @@ class SparsePack : public SparsePackBase {
   KOKKOS_INLINE_FUNCTION
   const Coordinates_t &GetCoordinates(const int b = 0) const { return coords_(b)(); }
 
+  KOKKOS_INLINE_FUNCTION
+  const int GetLid(const int b) const {
+    return pack_(0,b,0).lid;
+  }
+
   // Bound overloads
   KOKKOS_INLINE_FUNCTION int GetLowerBound(const int b) const {
     return (flat_ && (b > 0)) ? (bounds_(1, b - 1, nvar_) + 1) : 0;
diff --git a/src/interface/variable.cpp b/src/interface/variable.cpp
index fd8b534c95e5..8182358fef7c 100644
--- a/src/interface/variable.cpp
+++ b/src/interface/variable.cpp
@@ -121,7 +121,7 @@ void Variable<T>::AllocateData(MeshBlock *pmb, bool flag_uninitialized) {
       !is_allocated_,
       "Tried to allocate data for variable that's already allocated: " + label());
   data = std::make_from_tuple<ParArrayND<T, VariableState>>(std::tuple_cat(
-      std::make_tuple(label(), MakeVariableState()), ArrayToReverseTuple(dims_)));
+      std::make_tuple(label(), MakeVariableState(pmb->lid)), ArrayToReverseTuple(dims_)));
 
   ++num_alloc_;
 
@@ -160,7 +160,7 @@ void Variable<T>::AllocateFluxesAndCoarse(std::weak_ptr<MeshBlock> wpmb) {
     // A nodal field is the appropriate flux field for an edge variable
     dims_flux[MAX_VARIABLE_DIMENSION - 1] = n_outer;
     flux_data_ = std::make_from_tuple<ParArrayND<T, VariableState>>(
-        std::tuple_cat(std::make_tuple(label() + ".flux_data", MakeVariableState()),
+        std::tuple_cat(std::make_tuple(label() + ".flux_data", MakeVariableState(wpmb.lock()->lid)),
                        ArrayToReverseTuple(dims_flux)));
     // set up fluxes
     for (int d = X1DIR; d <= n_outer; ++d) {
@@ -178,7 +178,7 @@ void Variable<T>::AllocateFluxesAndCoarse(std::weak_ptr<MeshBlock> wpmb) {
 
     if (pmb->pmy_mesh != nullptr && pmb->pmy_mesh->multilevel) {
       coarse_s = std::make_from_tuple<ParArrayND<T, VariableState>>(
-          std::tuple_cat(std::make_tuple(label() + ".coarse", MakeVariableState()),
+          std::tuple_cat(std::make_tuple(label() + ".coarse", MakeVariableState(wpmb.lock()->lid)),
                          ArrayToReverseTuple(coarse_dims_)));
       pmb->LogMemUsage(coarse_s.size() * sizeof(T));
     }
diff --git a/src/interface/variable.hpp b/src/interface/variable.hpp
index 6542d99e7e43..7001f8858e93 100644
--- a/src/interface/variable.hpp
+++ b/src/interface/variable.hpp
@@ -170,7 +170,7 @@ class Variable {
   /// (Metadata::FillGhost is set)
   void AllocateFluxesAndCoarse(std::weak_ptr<MeshBlock> wpmb);
 
-  VariableState MakeVariableState() const { return VariableState(m_, sparse_id_, dims_); }
+  VariableState MakeVariableState(int lid) const { return VariableState(m_, lid, sparse_id_, dims_); }
 
   Metadata m_;
   const std::string base_name_;
diff --git a/src/interface/variable_state.cpp b/src/interface/variable_state.cpp
index b3d363f8613e..fa65b22303a0 100644
--- a/src/interface/variable_state.cpp
+++ b/src/interface/variable_state.cpp
@@ -16,11 +16,12 @@
 
 namespace parthenon {
 
-VariableState::VariableState(const Metadata &md, int sparse_id,
+VariableState::VariableState(const Metadata &md, int lid, int sparse_id,
                              const std::array<int, MAX_VARIABLE_DIMENSION> &dims) {
   allocation_threshold = md.GetAllocationThreshold();
   deallocation_threshold = md.GetDeallocationThreshold();
   sparse_default_val = md.GetDefaultValue();
+  this->lid = lid;
   this->sparse_id = sparse_id;
 
   tensor_shape[0] = dims[3];
@@ -28,6 +29,7 @@ VariableState::VariableState(const Metadata &md, int sparse_id,
   tensor_shape[2] = dims[5];
   tensor_components = tensor_shape[0] * tensor_shape[1] * tensor_shape[2];
   topological_type = GetTopologicalType(md);
+
 }
 
 } // namespace parthenon
diff --git a/src/interface/variable_state.hpp b/src/interface/variable_state.hpp
index 43dfefa8af35..4d0ae4e17cd1 100644
--- a/src/interface/variable_state.hpp
+++ b/src/interface/variable_state.hpp
@@ -29,7 +29,7 @@ static constexpr int InvalidSparseID = std::numeric_limits<int>::min();
 
 struct VariableState : public empty_state_t {
   explicit VariableState(
-      const Metadata &md, int sparse_id = InvalidSparseID,
+      const Metadata &md, int lid, int sparse_id = InvalidSparseID,
       const std::array<int, MAX_VARIABLE_DIMENSION> &dims = [] {
         std::array<int, MAX_VARIABLE_DIMENSION> d;
         for (int i = 0; i < MAX_VARIABLE_DIMENSION; ++i)
@@ -41,7 +41,8 @@ struct VariableState : public empty_state_t {
   VariableState(Real alloc, Real dealloc, Real sparse_default_val = 0.0,
                 int sparse_id = InvalidSparseID)
       : allocation_threshold(alloc), deallocation_threshold(dealloc),
-        sparse_default_val(sparse_default_val), sparse_id(sparse_id) {}
+        sparse_default_val(sparse_default_val), sparse_id(sparse_id),
+        lid(lid) {}
 
   KOKKOS_INLINE_FUNCTION
   VariableState(Real alloc, Real dealloc, int sparse_id)
@@ -58,7 +59,7 @@ struct VariableState : public empty_state_t {
   Real allocation_threshold;
   Real deallocation_threshold;
   Real sparse_default_val;
-  int sparse_id;
+  int sparse_id, lid;
   int vector_component = NODIR;
   bool initialized = true;
 
diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
index cabae1273b6f..bc6331fea796 100644
--- a/src/kokkos_abstraction.hpp
+++ b/src/kokkos_abstraction.hpp
@@ -47,6 +47,7 @@ using ScratchMemSpace = DevExecSpace::scratch_memory_space;
 using HostExecSpace = Kokkos::DefaultHostExecutionSpace;
 using LayoutWrapper = Kokkos::LayoutRight;
 using MemUnmanaged = Kokkos::MemoryTraits<Kokkos::Unmanaged>;
+using Atomic = Kokkos::MemoryTraits<Kokkos::Atomic>;
 
 #if defined(PARTHENON_ENABLE_HOST_COMM_BUFFERS)
 #if defined(KOKKOS_ENABLE_CUDA)
@@ -111,6 +112,10 @@ using HostArray6D = typename ParArray6D<T>::HostMirror;
 template <typename T>
 using HostArray7D = typename ParArray7D<T>::HostMirror;
 
+// Atomic arrays
+template <typename T>
+using AtomicParArray1D = Kokkos::View<T *, LayoutWrapper, DevMemSpace, Atomic>;
+
 using team_policy = Kokkos::TeamPolicy<>;
 using team_mbr_t = Kokkos::TeamPolicy<>::member_type;
 
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index ed84bee4a65e..dee739a17969 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -465,6 +465,10 @@ void Mesh::ResetLoadBalanceVariables() {
       costlist[pmb->gid] = TINY_NUMBER;
       pmb->ResetTimeMeasurement();
     }
+    parthenon::par_for(loop_pattern_flatrange_tag, "reset cost_d", DevExecSpace(), 0, block_list.size()-1,
+        KOKKOS_LAMBDA(const int b) {
+          cost_d[b] = TINY_NUMBER;
+        });
   }
   lb_flag_ = false;
   step_since_lb = 0;
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index bb5df2b95006..d293e30e3f6c 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -473,6 +473,7 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
                         packages, resolved_packages, gflag);
     block_list[i - nbs]->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
   }
+  Kokkos::realloc(cost_d, block_list.size());
 
   ResetLoadBalanceVariables();
 
@@ -737,6 +738,7 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,
     block_list[i - nbs]->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
   }
 
+  Kokkos::realloc(cost_d, block_list.size());
   ResetLoadBalanceVariables();
 
   // Output variables in use in this run
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index ad0d1a85539c..e8a0f0199670 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -107,6 +107,7 @@ class Mesh {
   BlockList_t block_list;
   Packages_t packages;
   std::shared_ptr<StateDescriptor> resolved_packages;
+  AtomicParArray1D<Real> cost_d;
 
   DataCollection<MeshData<Real>> mesh_data;
 
diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
new file mode 100644
index 000000000000..dbb547b8a62e
--- /dev/null
+++ b/src/utils/block_timer.hpp
@@ -0,0 +1,52 @@
+//========================================================================================
+// Parthenon performance portable AMR framework
+// Copyright(C) 2023 The Parthenon collaboration
+// Licensed under the 3-clause BSD License, see LICENSE file for details
+//========================================================================================
+// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
+//
+// This program was produced under U.S. Government contract 89233218CNA000001 for Los
+// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+// in the program are reserved by Triad National Security, LLC, and the U.S. Department
+// of Energy/National Nuclear Security Administration. The Government is granted for
+// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+// license in this material to reproduce, prepare derivative works, distribute copies to
+// the public, perform publicly and display publicly, and to permit others to do so.
+//========================================================================================
+#ifndef UTILS_BLOCK_TIMER_HPP_
+#define UTILS_BLOCK_TIMER_HPP_
+
+#include <limits>
+
+#include <impl/Kokkos_ClockTic.hpp>
+
+namespace parthenon {
+
+using cost_t = AtomicParArray1D<double>;
+class BlockTimer {
+ public:
+  BlockTimer() = delete;
+  KOKKOS_INLINE_FUNCTION
+  BlockTimer(const int lid, cost_t &cost) : lid_(lid), cost_(cost) {
+    start_ = Kokkos::Impl::clock_tic();
+  }
+  KOKKOS_INLINE_FUNCTION
+  ~BlockTimer() {
+    stop_ = Kokkos::Impl::clock_tic();
+    // deal with overflow of clock
+    cost_(lid) += (stop_ < start_ ?
+                  static_cast<double>(std::numeric_limits<uint64_t>::max() - start_)
+                    + static_cast<double>(stop_) :
+                  static_cast<double>(stop_ - start_));
+  }
+ private:
+  const int lid_;
+  const cost_t &cost_;
+  uint64_t start_, stop_;
+  ;
+};
+
+} // namespace parthenon
+
+#endif

From b1abf5bea0ba1298ad038be3c95a3162575ae39f Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 30 Aug 2023 13:22:05 -0600
Subject: [PATCH 02/67] lid to lid_

---
 src/utils/block_timer.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index dbb547b8a62e..3eeee65ecdae 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -35,7 +35,7 @@ class BlockTimer {
   ~BlockTimer() {
     stop_ = Kokkos::Impl::clock_tic();
     // deal with overflow of clock
-    cost_(lid) += (stop_ < start_ ?
+    cost_(lid_) += (stop_ < start_ ?
                   static_cast<double>(std::numeric_limits<uint64_t>::max() - start_)
                     + static_cast<double>(stop_) :
                   static_cast<double>(stop_ - start_));

From 950eaa9c0762665c5f3ea84c50162238e1d43fb3 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 30 Aug 2023 13:23:48 -0600
Subject: [PATCH 03/67] add const

---
 src/utils/block_timer.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index 3eeee65ecdae..d408f7a09d9c 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -28,7 +28,7 @@ class BlockTimer {
  public:
   BlockTimer() = delete;
   KOKKOS_INLINE_FUNCTION
-  BlockTimer(const int lid, cost_t &cost) : lid_(lid), cost_(cost) {
+  BlockTimer(const int lid, const cost_t &cost) : lid_(lid), cost_(cost) {
     start_ = Kokkos::Impl::clock_tic();
   }
   KOKKOS_INLINE_FUNCTION

From a661f6e34eeaa37b41dfd6370550d7fdb6ab0f13 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 30 Aug 2023 13:49:48 -0600
Subject: [PATCH 04/67] change interface to BlockTimer

---
 src/mesh/mesh.hpp         |  2 +-
 src/utils/block_timer.hpp | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index e8a0f0199670..a8a07e0e1224 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -107,7 +107,7 @@ class Mesh {
   BlockList_t block_list;
   Packages_t packages;
   std::shared_ptr<StateDescriptor> resolved_packages;
-  AtomicParArray1D<Real> cost_d;
+  ParArray1D<Real> cost_d;
 
   DataCollection<MeshData<Real>> mesh_data;
 
diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index d408f7a09d9c..c09ac150fd6b 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -19,30 +19,30 @@
 
 #include <limits>
 
+#include <Kokkos_Core.hpp>
 #include <impl/Kokkos_ClockTic.hpp>
 
 namespace parthenon {
 
-using cost_t = AtomicParArray1D<double>;
 class BlockTimer {
  public:
   BlockTimer() = delete;
   KOKKOS_INLINE_FUNCTION
-  BlockTimer(const int lid, const cost_t &cost) : lid_(lid), cost_(cost) {
+  BlockTimer(Real *cost) : cost_(cost) {
     start_ = Kokkos::Impl::clock_tic();
   }
   KOKKOS_INLINE_FUNCTION
   ~BlockTimer() {
     stop_ = Kokkos::Impl::clock_tic();
     // deal with overflow of clock
-    cost_(lid_) += (stop_ < start_ ?
+    auto diff = (stop_ < start_ ?
                   static_cast<double>(std::numeric_limits<uint64_t>::max() - start_)
                     + static_cast<double>(stop_) :
                   static_cast<double>(stop_ - start_));
+    Kokkos::atomic_add(cost_, diff);
   }
  private:
-  const int lid_;
-  const cost_t &cost_;
+  Real *cost_;
   uint64_t start_, stop_;
   ;
 };

From 23933c9f446327b8b6c6ea9c852c718fd6945aac Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 30 Aug 2023 13:54:02 -0600
Subject: [PATCH 05/67] use a raw view for now

---
 src/mesh/mesh.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index a8a07e0e1224..b4002f3d7064 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -107,7 +107,7 @@ class Mesh {
   BlockList_t block_list;
   Packages_t packages;
   std::shared_ptr<StateDescriptor> resolved_packages;
-  ParArray1D<Real> cost_d;
+  Kokkos::View<Real *> cost_d;
 
   DataCollection<MeshData<Real>> mesh_data;
 

From 09c8ea8a9f3c5c5b207c2a9cba845655deb20e05 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 30 Aug 2023 15:39:18 -0600
Subject: [PATCH 06/67] get lid from somewhere else

---
 src/interface/sparse_pack.hpp      |  2 +-
 src/interface/sparse_pack_base.cpp |  5 +++++
 src/interface/sparse_pack_base.hpp |  2 ++
 src/interface/variable.cpp         |  6 +++---
 src/interface/variable.hpp         |  2 +-
 src/interface/variable_state.cpp   |  3 +--
 src/interface/variable_state.hpp   |  7 +++----
 src/utils/block_timer.hpp          | 10 +++++-----
 8 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp
index abb0b0397c3a..5ad8980a3a31 100644
--- a/src/interface/sparse_pack.hpp
+++ b/src/interface/sparse_pack.hpp
@@ -158,7 +158,7 @@ class SparsePack : public SparsePackBase {
 
   KOKKOS_INLINE_FUNCTION
   const int GetLid(const int b) const {
-    return pack_(0,b,0).lid;
+    return lid_(b);
   }
 
   // Bound overloads
diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index 6ddc77194fba..5139e8151ed5 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -177,6 +177,9 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
   pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks);
   auto coords_h = Kokkos::create_mirror_view(pack.coords_);
 
+  pack.lid_ = lid_t("lid", nblocks);
+  auto lid_h = Kokkos::create_mirror_view(pack.lid_);
+
   // Fill the views
   int idx = 0;
   int blidx = 0;
@@ -191,6 +194,7 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
       // packs.
       coords_h(b) = pmbd->GetBlockPointer()->coords_device;
     }
+    lid_h(blidx) = pmbd->GetBlockPointer()->lid;
 
     for (int i = 0; i < nvar; ++i) {
       pack.bounds_h_(0, blidx, i) = idx;
@@ -261,6 +265,7 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
   Kokkos::deep_copy(pack.pack_, pack_h);
   Kokkos::deep_copy(pack.bounds_, pack.bounds_h_);
   Kokkos::deep_copy(pack.coords_, coords_h);
+  Kokkos::deep_copy(pack.lid_, lid_h);
 
   return pack;
 }
diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index 7970d86f330a..a8d2611d2105 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -58,6 +58,7 @@ class SparsePackBase {
   using bounds_t = ParArray3D<int>;
   using bounds_h_t = typename ParArray3D<int>::HostMirror;
   using coords_t = ParArray1D<ParArray0D<Coordinates_t>>;
+  using lid_t = ParArray1D<int>;
 
   // Returns a SparsePackBase object that is either newly created or taken
   // from the cache in pmd. The cache itself handles the all of this logic
@@ -88,6 +89,7 @@ class SparsePackBase {
   bounds_t bounds_;
   bounds_h_t bounds_h_;
   coords_t coords_;
+  lid_t lid_;
 
   bool with_fluxes_;
   bool coarse_;
diff --git a/src/interface/variable.cpp b/src/interface/variable.cpp
index 8182358fef7c..fd8b534c95e5 100644
--- a/src/interface/variable.cpp
+++ b/src/interface/variable.cpp
@@ -121,7 +121,7 @@ void Variable<T>::AllocateData(MeshBlock *pmb, bool flag_uninitialized) {
       !is_allocated_,
       "Tried to allocate data for variable that's already allocated: " + label());
   data = std::make_from_tuple<ParArrayND<T, VariableState>>(std::tuple_cat(
-      std::make_tuple(label(), MakeVariableState(pmb->lid)), ArrayToReverseTuple(dims_)));
+      std::make_tuple(label(), MakeVariableState()), ArrayToReverseTuple(dims_)));
 
   ++num_alloc_;
 
@@ -160,7 +160,7 @@ void Variable<T>::AllocateFluxesAndCoarse(std::weak_ptr<MeshBlock> wpmb) {
     // A nodal field is the appropriate flux field for an edge variable
     dims_flux[MAX_VARIABLE_DIMENSION - 1] = n_outer;
     flux_data_ = std::make_from_tuple<ParArrayND<T, VariableState>>(
-        std::tuple_cat(std::make_tuple(label() + ".flux_data", MakeVariableState(wpmb.lock()->lid)),
+        std::tuple_cat(std::make_tuple(label() + ".flux_data", MakeVariableState()),
                        ArrayToReverseTuple(dims_flux)));
     // set up fluxes
     for (int d = X1DIR; d <= n_outer; ++d) {
@@ -178,7 +178,7 @@ void Variable<T>::AllocateFluxesAndCoarse(std::weak_ptr<MeshBlock> wpmb) {
 
     if (pmb->pmy_mesh != nullptr && pmb->pmy_mesh->multilevel) {
       coarse_s = std::make_from_tuple<ParArrayND<T, VariableState>>(
-          std::tuple_cat(std::make_tuple(label() + ".coarse", MakeVariableState(wpmb.lock()->lid)),
+          std::tuple_cat(std::make_tuple(label() + ".coarse", MakeVariableState()),
                          ArrayToReverseTuple(coarse_dims_)));
       pmb->LogMemUsage(coarse_s.size() * sizeof(T));
     }
diff --git a/src/interface/variable.hpp b/src/interface/variable.hpp
index 7001f8858e93..6542d99e7e43 100644
--- a/src/interface/variable.hpp
+++ b/src/interface/variable.hpp
@@ -170,7 +170,7 @@ class Variable {
   /// (Metadata::FillGhost is set)
   void AllocateFluxesAndCoarse(std::weak_ptr<MeshBlock> wpmb);
 
-  VariableState MakeVariableState(int lid) const { return VariableState(m_, lid, sparse_id_, dims_); }
+  VariableState MakeVariableState() const { return VariableState(m_, sparse_id_, dims_); }
 
   Metadata m_;
   const std::string base_name_;
diff --git a/src/interface/variable_state.cpp b/src/interface/variable_state.cpp
index fa65b22303a0..ca77cf7bb4e8 100644
--- a/src/interface/variable_state.cpp
+++ b/src/interface/variable_state.cpp
@@ -16,12 +16,11 @@
 
 namespace parthenon {
 
-VariableState::VariableState(const Metadata &md, int lid, int sparse_id,
+VariableState::VariableState(const Metadata &md, int sparse_id,
                              const std::array<int, MAX_VARIABLE_DIMENSION> &dims) {
   allocation_threshold = md.GetAllocationThreshold();
   deallocation_threshold = md.GetDeallocationThreshold();
   sparse_default_val = md.GetDefaultValue();
-  this->lid = lid;
   this->sparse_id = sparse_id;
 
   tensor_shape[0] = dims[3];
diff --git a/src/interface/variable_state.hpp b/src/interface/variable_state.hpp
index 4d0ae4e17cd1..43dfefa8af35 100644
--- a/src/interface/variable_state.hpp
+++ b/src/interface/variable_state.hpp
@@ -29,7 +29,7 @@ static constexpr int InvalidSparseID = std::numeric_limits<int>::min();
 
 struct VariableState : public empty_state_t {
   explicit VariableState(
-      const Metadata &md, int lid, int sparse_id = InvalidSparseID,
+      const Metadata &md, int sparse_id = InvalidSparseID,
       const std::array<int, MAX_VARIABLE_DIMENSION> &dims = [] {
         std::array<int, MAX_VARIABLE_DIMENSION> d;
         for (int i = 0; i < MAX_VARIABLE_DIMENSION; ++i)
@@ -41,8 +41,7 @@ struct VariableState : public empty_state_t {
   VariableState(Real alloc, Real dealloc, Real sparse_default_val = 0.0,
                 int sparse_id = InvalidSparseID)
       : allocation_threshold(alloc), deallocation_threshold(dealloc),
-        sparse_default_val(sparse_default_val), sparse_id(sparse_id),
-        lid(lid) {}
+        sparse_default_val(sparse_default_val), sparse_id(sparse_id) {}
 
   KOKKOS_INLINE_FUNCTION
   VariableState(Real alloc, Real dealloc, int sparse_id)
@@ -59,7 +58,7 @@ struct VariableState : public empty_state_t {
   Real allocation_threshold;
   Real deallocation_threshold;
   Real sparse_default_val;
-  int sparse_id, lid;
+  int sparse_id;
   int vector_component = NODIR;
   bool initialized = true;
 
diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index c09ac150fd6b..76b8508b592e 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -33,17 +33,17 @@ class BlockTimer {
   }
   KOKKOS_INLINE_FUNCTION
   ~BlockTimer() {
-    stop_ = Kokkos::Impl::clock_tic();
+    auto stop = Kokkos::Impl::clock_tic();
     // deal with overflow of clock
-    auto diff = (stop_ < start_ ?
+    auto diff = (stop < start_ ?
                   static_cast<double>(std::numeric_limits<uint64_t>::max() - start_)
-                    + static_cast<double>(stop_) :
-                  static_cast<double>(stop_ - start_));
+                    + static_cast<double>(stop) :
+                  static_cast<double>(stop - start_));
     Kokkos::atomic_add(cost_, diff);
   }
  private:
   Real *cost_;
-  uint64_t start_, stop_;
+  uint64_t start_;
   ;
 };
 

From 10f141c0a6a90bce39a3582a98d8033e88c10677 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 30 Aug 2023 15:54:19 -0600
Subject: [PATCH 07/67] realloc cost_d

---
 src/mesh/amr_loadbalance.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index dee739a17969..e5c36a1f5eda 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -937,6 +937,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   loclist = std::move(newloc);
   ranklist = std::move(newrank);
   costlist = std::move(newcost);
+  Kokkos::realloc(cost_d, nbe - nbs + 1);
 
   // re-initialize the MeshBlocks
   for (auto &pmb : block_list) {

From 3d328785845aec50a82342c9214cfa9b93814d69 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 31 Aug 2023 08:41:15 -0600
Subject: [PATCH 08/67] refactor

---
 src/interface/sparse_pack.hpp      |  6 ++++++
 src/interface/sparse_pack_base.cpp |  2 ++
 src/interface/sparse_pack_base.hpp |  2 ++
 src/mesh/amr_loadbalance.cpp       |  2 +-
 src/mesh/mesh.cpp                  |  4 ++--
 src/mesh/mesh.hpp                  |  2 +-
 src/parthenon_array_generic.hpp    |  9 +++++++++
 src/utils/block_timer.hpp          | 26 ++++++++++++++++++++------
 8 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp
index 5ad8980a3a31..401448d1705a 100644
--- a/src/interface/sparse_pack.hpp
+++ b/src/interface/sparse_pack.hpp
@@ -156,11 +156,17 @@ class SparsePack : public SparsePackBase {
   KOKKOS_INLINE_FUNCTION
   const Coordinates_t &GetCoordinates(const int b = 0) const { return coords_(b)(); }
 
+  // Methods used for timing blocks
   KOKKOS_INLINE_FUNCTION
   const int GetLid(const int b) const {
     return lid_(b);
   }
 
+  KOKKOS_INLINE_FUNCTION
+  double &GetCost(const int b) const {
+    return cost_(GetLid(b));
+  }
+
   // Bound overloads
   KOKKOS_INLINE_FUNCTION int GetLowerBound(const int b) const {
     return (flat_ && (b > 0)) ? (bounds_(1, b - 1, nvar_) + 1) : 0;
diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index 5139e8151ed5..42d3d31cab09 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -28,6 +28,7 @@
 #include "interface/sparse_pack_base.hpp"
 #include "interface/state_descriptor.hpp"
 #include "interface/variable.hpp"
+#include "mesh/mesh.hpp"
 #include "utils/utils.hpp"
 namespace parthenon {
 namespace impl {
@@ -262,6 +263,7 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
     pack.bounds_h_(1, blidx, nvar) = idx - 1;
     blidx++;
   });
+  pack.cost_ = pmd->GetMeshPointer()->cost_d;
   Kokkos::deep_copy(pack.pack_, pack_h);
   Kokkos::deep_copy(pack.bounds_, pack.bounds_h_);
   Kokkos::deep_copy(pack.coords_, coords_h);
diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index a8d2611d2105..c09869fc8aae 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -59,6 +59,7 @@ class SparsePackBase {
   using bounds_h_t = typename ParArray3D<int>::HostMirror;
   using coords_t = ParArray1D<ParArray0D<Coordinates_t>>;
   using lid_t = ParArray1D<int>;
+  using cost_t = ParArray1D<double>;
 
   // Returns a SparsePackBase object that is either newly created or taken
   // from the cache in pmd. The cache itself handles the all of this logic
@@ -90,6 +91,7 @@ class SparsePackBase {
   bounds_h_t bounds_h_;
   coords_t coords_;
   lid_t lid_;
+  cost_t cost_;
 
   bool with_fluxes_;
   bool coarse_;
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index e5c36a1f5eda..d82f97b0cacf 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -937,7 +937,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   loclist = std::move(newloc);
   ranklist = std::move(newrank);
   costlist = std::move(newcost);
-  Kokkos::realloc(cost_d, nbe - nbs + 1);
+  cost_d.Realloc(nbe - nbs + 1);
 
   // re-initialize the MeshBlocks
   for (auto &pmb : block_list) {
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index d293e30e3f6c..1c856748c1a4 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -473,7 +473,7 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
                         packages, resolved_packages, gflag);
     block_list[i - nbs]->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
   }
-  Kokkos::realloc(cost_d, block_list.size());
+  cost_d.Realloc(block_list.size());
 
   ResetLoadBalanceVariables();
 
@@ -738,7 +738,7 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,
     block_list[i - nbs]->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
   }
 
-  Kokkos::realloc(cost_d, block_list.size());
+  cost_d.Realloc(block_list.size());
   ResetLoadBalanceVariables();
 
   // Output variables in use in this run
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index b4002f3d7064..48147e55e67a 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -107,7 +107,7 @@ class Mesh {
   BlockList_t block_list;
   Packages_t packages;
   std::shared_ptr<StateDescriptor> resolved_packages;
-  Kokkos::View<Real *> cost_d;
+  ParArray1D<double> cost_d;
 
   DataCollection<MeshData<Real>> mesh_data;
 
diff --git a/src/parthenon_array_generic.hpp b/src/parthenon_array_generic.hpp
index d527707f9070..609869fa256a 100644
--- a/src/parthenon_array_generic.hpp
+++ b/src/parthenon_array_generic.hpp
@@ -170,6 +170,11 @@ class ParArrayGeneric : public State {
   void Resize(Args... args) {
     Resize(std::make_index_sequence<Data::rank - sizeof...(Args)>{}, args...);
   }
+  template <class... Args, REQUIRES(all_implement<integral(Args...)>::value),
+            REQUIRES(Data::rank - sizeof...(Args) >= 0)>
+  void Realloc(Args... args) {
+    Realloc(std::make_index_sequence<Data::rank - sizeof...(Args)>{}, args...);
+  }
 
   template <class... Args, REQUIRES(all_implement<integral_or_enum(Args...)>::value),
             REQUIRES(Data::rank - sizeof...(Args) >= 0)>
@@ -289,6 +294,10 @@ class ParArrayGeneric : public State {
   void Resize(std::index_sequence<I...>, Args... args) {
     Kokkos::resize(data_, ((void)I, 1)..., args...);
   }
+  template <class... Args, std::size_t... I>
+  void Realloc(std::index_sequence<I...>, Args... args) {
+    Kokkos::realloc(data_, ((void)I, 1)..., args...);
+  }
 
   template <class... Args, std::size_t... I>
   KOKKOS_FORCEINLINE_FUNCTION auto &_operator_impl(std::index_sequence<I...>,
diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index 76b8508b592e..c67f151ee3f4 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -24,13 +24,19 @@
 
 namespace parthenon {
 
+template <typename T>
 class BlockTimer {
  public:
   BlockTimer() = delete;
+  // constructor for team policies
   KOKKOS_INLINE_FUNCTION
-  BlockTimer(Real *cost) : cost_(cost) {
-    start_ = Kokkos::Impl::clock_tic();
-  }
+  BlockTimer(team_mbr_t &member, const T &pack, const int b = 0)
+    : member_(&member), pack_(pack), b_(b), start_(Kokkos::Impl::clock_tic()) {}
+  // constructor for non-team policies
+  KOKKOS_INLINE_FUNCTION
+  BlockTimer(const T &pack, const int b = 0)
+    : member_(nullptr), pack_(pack), b_(b), start_(Kokkos::Impl::clock_tic()) {}
+  // constructor for team policies without a block index
   KOKKOS_INLINE_FUNCTION
   ~BlockTimer() {
     auto stop = Kokkos::Impl::clock_tic();
@@ -39,11 +45,19 @@ class BlockTimer {
                   static_cast<double>(std::numeric_limits<uint64_t>::max() - start_)
                     + static_cast<double>(stop) :
                   static_cast<double>(stop - start_));
-    Kokkos::atomic_add(cost_, diff);
+    if (member_ == nullptr) {
+      Kokkos::atomic_add(&(pack.GetCost(b)), diff);
+    } else {
+      Kokkos::single(Kokkos::PerTeam(*member), [&] () {
+        Kokkos::atomic_add(&(pack.GetCost(b)), diff);
+      });
+    }
   }
  private:
-  Real *cost_;
-  uint64_t start_;
+  const team_mbr_t *member_;
+  const T &pack_;
+  const int b_;
+  const uint64_t start_;
   ;
 };
 

From 172da79f184a19c8c0db55864b1a141efcef3731 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 31 Aug 2023 08:44:52 -0600
Subject: [PATCH 09/67] oops

---
 src/utils/block_timer.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index c67f151ee3f4..b4e98ed58405 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -46,10 +46,10 @@ class BlockTimer {
                     + static_cast<double>(stop) :
                   static_cast<double>(stop - start_));
     if (member_ == nullptr) {
-      Kokkos::atomic_add(&(pack.GetCost(b)), diff);
+      Kokkos::atomic_add(&(pack_.GetCost(b_)), diff);
     } else {
-      Kokkos::single(Kokkos::PerTeam(*member), [&] () {
-        Kokkos::atomic_add(&(pack.GetCost(b)), diff);
+      Kokkos::single(Kokkos::PerTeam(*member_), [&] () {
+        Kokkos::atomic_add(&(pack_.GetCost(b_)), diff);
       });
     }
   }

From a7287dc20e40a2257e0c8aa316703ff19bb1b458 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 31 Aug 2023 14:03:48 -0600
Subject: [PATCH 10/67] new load balancer, new options, time boundaries

---
 CMakeLists.txt                             |   6 +
 src/bvals/comms/bnd_info.cpp               |   9 +-
 src/bvals/comms/bnd_info.hpp               |   2 +
 src/bvals/comms/boundary_communication.cpp |   9 +
 src/config.hpp.in                          |   3 +
 src/interface/sparse_pack.hpp              |   3 +
 src/interface/sparse_pack_base.cpp         |  10 +-
 src/interface/sparse_pack_base.hpp         |   3 +
 src/mesh/amr_loadbalance.cpp               | 328 +++++++++++----------
 src/mesh/mesh.cpp                          |  23 +-
 src/mesh/mesh.hpp                          |  17 +-
 src/utils/block_timer.hpp                  |  44 ++-
 12 files changed, 276 insertions(+), 181 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 33c19c34256d..e4be8054241f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,7 @@ option(PARTHENON_ENABLE_HOST_COMM_BUFFERS "CUDA/HIP Only: Allocate communication
 option(PARTHENON_DISABLE_HDF5 "HDF5 is enabled by default if found, set this to True to disable HDF5" OFF)
 option(PARTHENON_DISABLE_HDF5_COMPRESSION "HDF5 compression is enabled by default, set this to True to disable compression in HDF5 output/restart files" OFF)
 option(PARTHENON_DISABLE_SPARSE "Sparse capability is enabled by default, set this to True to compile-time disable all sparse capability" OFF)
+option(PARTHENON_ENABLE_LB_TIMERS "Timer-based load balancing is disabled by default, set this to True to enable timers" OFF)
 option(PARTHENON_ENABLE_ASCENT "Enable Ascent for in situ visualization and analysis" OFF)
 option(PARTHENON_LINT_DEFAULT "Linting is turned off by default, use the \"lint\" target or set \
 this to True to enable linting in the default target" OFF)
@@ -132,6 +133,11 @@ if (PARTHENON_DISABLE_SPARSE)
   set(ENABLE_SPARSE OFF)
 endif()
 
+set(ENABLE_LB_TIMERS OFF)
+if (PARTHENON_ENABLE_LB_TIMERS)
+  set(ENABLE_LB_TIMERS ON)
+endif()
+
 set(ENABLE_HDF5 OFF)
 if (NOT PARTHENON_DISABLE_HDF5)
   set(HDF5_PREFER_PARALLEL ${ENABLE_MPI})
diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp
index b3b569e75ae8..bd1ef608681e 100644
--- a/src/bvals/comms/bnd_info.cpp
+++ b/src/bvals/comms/bnd_info.cpp
@@ -190,6 +190,8 @@ BndInfo BndInfo::GetSendBndInfo(std::shared_ptr<MeshBlock> pmb, const NeighborBl
                                 CommBuffer<buf_pool_t<Real>::owner_t> *buf) {
   BndInfo out;
 
+  out.block_lid = pmb->lid;
+
   out.allocated = v->IsAllocated();
   if (!out.allocated) return out;
 
@@ -287,6 +289,8 @@ ProResInfo ProResInfo::GetSend(std::shared_ptr<MeshBlock> pmb, const NeighborBlo
                                std::shared_ptr<Variable<Real>> v) {
   ProResInfo out;
 
+  out.block_lid = pmb->lid;
+
   out.allocated = v->IsAllocated();
   if (!out.allocated) return out;
 
@@ -323,6 +327,7 @@ ProResInfo ProResInfo::GetSet(std::shared_ptr<MeshBlock> pmb, const NeighborBloc
   int Nu = v->GetDim(5);
   int Nt = v->GetDim(6);
 
+  out.block_lid = pmb->lid;
   int mylevel = pmb->loc.level();
   out.coords = pmb->coords;
   if (pmb->pmr) out.coarse_coords = pmb->pmr->GetCoarseCoords();
@@ -394,6 +399,7 @@ BndInfo BndInfo::GetSetBndInfo(std::shared_ptr<MeshBlock> pmb, const NeighborBlo
   int Nu = v->GetDim(5);
   int Nt = v->GetDim(6);
 
+  out.block_lid = pmb->lid;
   int mylevel = pmb->loc.level();
 
   auto elements = v->GetTopologicalElements();
@@ -416,6 +422,7 @@ BndInfo BndInfo::GetSendCCFluxCor(std::shared_ptr<MeshBlock> pmb, const Neighbor
                                   std::shared_ptr<Variable<Real>> v,
                                   CommBuffer<buf_pool_t<Real>::owner_t> *buf) {
   BndInfo out;
+  out.block_lid = pmb->lid;
   out.allocated = v->IsAllocated();
   if (!v->IsAllocated()) {
     // Not going to actually do anything with this buffer
@@ -473,7 +480,7 @@ BndInfo BndInfo::GetSetCCFluxCor(std::shared_ptr<MeshBlock> pmb, const NeighborB
                                  std::shared_ptr<Variable<Real>> v,
                                  CommBuffer<buf_pool_t<Real>::owner_t> *buf) {
   BndInfo out;
-
+  out.block_lid = pmb->lid;
   if (!v->IsAllocated() || buf->GetState() != BufferState::received) {
     out.allocated = false;
     return out;
diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp
index d49303221745..c95944b5ec36 100644
--- a/src/bvals/comms/bnd_info.hpp
+++ b/src/bvals/comms/bnd_info.hpp
@@ -44,6 +44,7 @@ struct BndInfo {
   int ntopological_elements = 1;
   SpatiallyMaskedIndexer6D idxer[3];
 
+  int block_lid;
   CoordinateDirection dir;
   bool allocated = true;
   bool buf_allocated = true;
@@ -77,6 +78,7 @@ struct ProResInfo {
   // conversion of TopologicalElements
   SpatiallyMaskedIndexer6D idxer[10];
 
+  int block_lid;
   CoordinateDirection dir;
   bool allocated = true;
   RefinementOp_t refinement_op = RefinementOp_t::None;
diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp
index 2deabe4ef81c..9f7b80f3e0a1 100644
--- a/src/bvals/comms/boundary_communication.cpp
+++ b/src/bvals/comms/boundary_communication.cpp
@@ -36,6 +36,7 @@
 #include "prolong_restrict/prolong_restrict.hpp"
 #include "tasks/task_id.hpp"
 #include "tasks/task_list.hpp"
+#include "utils/block_timer.hpp"
 #include "utils/error_checking.hpp"
 #include "utils/loop_utils.hpp"
 
@@ -49,6 +50,7 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
   Kokkos::Profiling::pushRegion("Task_LoadAndSendBoundBufs");
 
   Mesh *pmesh = md->GetMeshPointer();
+  auto &block_cost = pmesh->block_cost;
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true);
 
   if (cache.buf_vec.size() == 0)
@@ -87,6 +89,9 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
       KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
         const int b = team_member.league_rank();
+#ifdef ENABLE_LB_TIMERS
+        BlockTimer timer(team_member, &block_cost(bnd_info(b).block_lid));
+#endif
 
         if (!bnd_info(b).allocated) {
           Kokkos::single(Kokkos::PerTeam(team_member),
@@ -212,6 +217,7 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
   Kokkos::Profiling::pushRegion("Task_SetInternalBoundaries");
 
   Mesh *pmesh = md->GetMeshPointer();
+  auto &block_cost = pmesh->block_cost;
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
 
   auto [rebuild, nbound] = CheckReceiveBufferCacheForRebuild<bound_type, false>(md);
@@ -226,6 +232,9 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
       KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
         const int b = team_member.league_rank();
+#ifdef ENABLE_LB_TIMERS
+        BlockTimer timer(team_member, &block_cost(bnd_info(b).block_lid));
+#endif
         int idx_offset = 0;
         for (int iel = 0; iel < bnd_info(b).ntopological_elements; ++iel) {
           auto &idxer = bnd_info(b).idxer[iel];
diff --git a/src/config.hpp.in b/src/config.hpp.in
index 299fe0936978..454f31976104 100644
--- a/src/config.hpp.in
+++ b/src/config.hpp.in
@@ -51,6 +51,9 @@
 // define ENABLE_SPARSE or not at all
 #cmakedefine ENABLE_SPARSE
 
+// define ENABLE_LB_TIMERS or not at all
+#cmakedefine ENABLE_LB_TIMERS
+
 // define PARTHENON_ENABLE_ASCENT or not at all
 #cmakedefine PARTHENON_ENABLE_ASCENT
 
diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp
index 401448d1705a..acbf57adf8cb 100644
--- a/src/interface/sparse_pack.hpp
+++ b/src/interface/sparse_pack.hpp
@@ -26,6 +26,7 @@
 #include <utility>
 #include <vector>
 
+#include "config.hpp"
 #include "coordinates/coordinates.hpp"
 #include "interface/sparse_pack_base.hpp"
 #include "interface/variable.hpp"
@@ -156,6 +157,7 @@ class SparsePack : public SparsePackBase {
   KOKKOS_INLINE_FUNCTION
   const Coordinates_t &GetCoordinates(const int b = 0) const { return coords_(b)(); }
 
+#ifdef ENABLE_LB_TIMERS
   // Methods used for timing blocks
   KOKKOS_INLINE_FUNCTION
   const int GetLid(const int b) const {
@@ -166,6 +168,7 @@ class SparsePack : public SparsePackBase {
   double &GetCost(const int b) const {
     return cost_(GetLid(b));
   }
+#endif
 
   // Bound overloads
   KOKKOS_INLINE_FUNCTION int GetLowerBound(const int b) const {
diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index 42d3d31cab09..371a45c141de 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -178,8 +178,10 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
   pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks);
   auto coords_h = Kokkos::create_mirror_view(pack.coords_);
 
+#ifdef ENABLE_LB_TIMERS
   pack.lid_ = lid_t("lid", nblocks);
   auto lid_h = Kokkos::create_mirror_view(pack.lid_);
+#endif
 
   // Fill the views
   int idx = 0;
@@ -195,7 +197,9 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
       // packs.
       coords_h(b) = pmbd->GetBlockPointer()->coords_device;
     }
+  #ifdef ENABLE_LB_TIMERS
     lid_h(blidx) = pmbd->GetBlockPointer()->lid;
+  #endif
 
     for (int i = 0; i < nvar; ++i) {
       pack.bounds_h_(0, blidx, i) = idx;
@@ -263,11 +267,13 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
     pack.bounds_h_(1, blidx, nvar) = idx - 1;
     blidx++;
   });
-  pack.cost_ = pmd->GetMeshPointer()->cost_d;
+#ifdef ENABLE_LB_TIMERS
+  pack.cost_ = pmd->GetMeshPointer()->block_cost;
+  Kokkos::deep_copy(pack.lid_, lid_h);
+#endif
   Kokkos::deep_copy(pack.pack_, pack_h);
   Kokkos::deep_copy(pack.bounds_, pack.bounds_h_);
   Kokkos::deep_copy(pack.coords_, coords_h);
-  Kokkos::deep_copy(pack.lid_, lid_h);
 
   return pack;
 }
diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index c09869fc8aae..e021a49db2f3 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -26,6 +26,7 @@
 #include <utility>
 #include <vector>
 
+#include "config.hpp"
 #include "coordinates/coordinates.hpp"
 #include "interface/state_descriptor.hpp"
 #include "interface/variable.hpp"
@@ -90,8 +91,10 @@ class SparsePackBase {
   bounds_t bounds_;
   bounds_h_t bounds_h_;
   coords_t coords_;
+#ifdef ENABLE_LB_TIMERS
   lid_t lid_;
   cost_t cost_;
+#endif
 
   bool with_fluxes_;
   bool coarse_;
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index d82f97b0cacf..4bf024437e15 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -326,132 +326,140 @@ void Mesh::LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin,
     nbdel += ndel;
   }
 
-  lb_flag_ |= lb_automatic_;
-
-  UpdateCostList();
-
-  modified = false;
   if (nnew != 0 || ndel != 0) { // at least one (de)refinement happened
-    GatherCostListAndCheckBalance();
-    RedistributeAndRefineMeshBlocks(pin, app_in, nbtotal + nnew - ndel);
-    modified = true;
-  } else if (lb_flag_ && step_since_lb >= lb_interval_) {
-    if (!GatherCostListAndCheckBalance()) { // load imbalance detected
-      RedistributeAndRefineMeshBlocks(pin, app_in, nbtotal);
-      modified = true;
-    }
-    lb_flag_ = false;
+    RedistributeAndRefineMeshBlocks(pin, app_in, nbtotal + nnew - ndel, true);
+  } else if (step_since_lb >= lb_interval_) {
+    RedistributeAndRefineMeshBlocks(pin, app_in, nbtotal, false);
   }
   Kokkos::Profiling::popRegion(); // LoadBalancingAndAdaptiveMeshRefinement
 }
 
 // Private routines
 namespace {
-/**
- * @brief This routine assigns blocks to ranks by attempting to place index-contiguous
- * blocks of equal total cost on each rank.
- *
- * @param costlist (Input) A map of global block ID to a relative weight.
- * @param ranklist (Output) A map of global block ID to ranks.
- */
-void AssignBlocks(std::vector<double> const &costlist, std::vector<int> &ranklist) {
-  ranklist.resize(costlist.size());
-
-  double const total_cost = std::accumulate(costlist.begin(), costlist.end(), 0.0);
-
-  int rank = (Globals::nranks)-1;
-  double target_cost = total_cost / Globals::nranks;
-  double my_cost = 0.0;
-  double remaining_cost = total_cost;
-  // create rank list from the end: the master MPI rank should have less load
-  for (int block_id = costlist.size() - 1; block_id >= 0; block_id--) {
-    if (target_cost == 0.0) {
-      std::stringstream msg;
-      msg << "### FATAL ERROR in CalculateLoadBalance" << std::endl
-          << "There is at least one process which has no MeshBlock" << std::endl
-          << "Decrease the number of processes or use smaller MeshBlocks." << std::endl;
-      PARTHENON_FAIL(msg);
+Real DistributeTrial(std::vector<Real> const &cost, std::vector<int> &start,
+                     std::vector<int> &nb, const int nranks, Real target_max) {
+  Real trial_max = 0.0;
+  const int nblocks = cost.size();
+  int nleft = nblocks;
+  int last = 0;
+  for (int rank = 0; rank < nranks; rank++) {
+    const int ranks_left = nranks - rank;
+    start[rank] = last;
+    Real rank_cost = cost[last];
+    nleft--;
+    while (nleft > ranks_left && rank_cost + cost[last+1] < target_max) {
+      last += 1;
+      rank_cost += cost[last];
+      nleft--;
     }
-    my_cost += costlist[block_id];
-    ranklist[block_id] = rank;
-    if (my_cost >= target_cost && rank > 0) {
-      rank--;
-      remaining_cost -= my_cost;
-      my_cost = 0.0;
-      target_cost = remaining_cost / (rank + 1);
+    last++;
+    if (rank == nranks-1) {
+      for (int n = last; n < nblocks; n++) {
+        rank_cost += cost[n];
+      }
+      last = nblocks;
     }
+    nb[rank] = last - start[rank];
+    trial_max = std::max(trial_max, rank_cost);
   }
+  return trial_max;
 }
 
-void UpdateBlockList(std::vector<int> const &ranklist, std::vector<int> &nslist,
-                     std::vector<int> &nblist) {
-  nslist.resize(Globals::nranks);
-  nblist.resize(Globals::nranks);
-
-  nslist[0] = 0;
-  int rank = 0;
-  for (int block_id = 1; block_id < ranklist.size(); block_id++) {
-    if (ranklist[block_id] != ranklist[block_id - 1]) {
-      nblist[rank] = block_id - nslist[rank];
-      nslist[++rank] = block_id;
+double CalculateNewBalance(std::vector<double> const &cost, std::vector<int> &start, std::vector<int> &nb,
+                           const double avg_cost, const double max_block_cost) {
+  start.resize(Globals::nranks);
+  nb.resize(Globals::nranks);
+  const int nblocks = cost.size();
+  const int max_rank = std::min(nblocks, Globals::nranks);
+
+  for (int i = max_rank; i < Globals::nranks; i++) {
+    start[i] = -1;
+    nb[i] = 0;
+  }
+
+  // set the bounds for the search
+  double a = avg_cost;
+  double b = std::max(2.0*avg_cost, 1.01*max_block_cost);
+  Real h = b - a;
+  constexpr Real invphi = 0.618033988749894848204586834366;
+  constexpr Real invphi2 = 0.38196601125010515179541316563436188228;
+  Real c = a + invphi2 * h;
+  Real d = a + invphi * h;
+  Real yc = DistributeTrial(cost, start, nb, max_rank, c);
+  Real yd = DistributeTrial(cost, start, nb, max_rank, d);
+
+  while (yc != yd) {
+    if (yc < yd) {
+      b = d;
+      d = c;
+      yd = yc;
+      h = invphi * h;
+      c = a + invphi2 * h;
+      yc = DistributeTrial(cost, start, nb, max_rank, c);
+    } else {
+      a = c;
+      c = d;
+      yc = yd;
+      h = invphi * h;
+      d = a + invphi * h;
+      yd = DistributeTrial(cost, start, nb, max_rank, d);
     }
   }
-  nblist[rank] = ranklist.size() - nslist[rank];
+
+  return yc;
 }
-} // namespace
 
-//----------------------------------------------------------------------------------------
-// \brief Calculate distribution of MeshBlocks based on the cost list
-void Mesh::CalculateLoadBalance(std::vector<double> const &costlist,
-                                std::vector<int> &ranklist, std::vector<int> &nslist,
-                                std::vector<int> &nblist) {
-  Kokkos::Profiling::pushRegion("CalculateLoadBalance");
-  auto const total_blocks = costlist.size();
+void AssignBlocks(const std::vector<int> &start, const std::vector<int> &nb, std::vector<int> &rank) {
+  const int nblocks = rank.size();
+  for (int i = 0; i < std::min(nblocks, Globals::nranks); i++) {
+    for (int n = start[i]; n < start[i] + nb[i]; n++) {
+      rank[n] = i;
+    }
+  }
+}
 
-  using it = std::vector<double>::const_iterator;
-  std::pair<it, it> const min_max = std::minmax_element(costlist.begin(), costlist.end());
+std::tuple<double, double> BlockCostInfo(std::vector<double> const &cost) {
+  const int nblocks = cost.size();
+  const int max_rank = std::min(Globals::nranks, nblocks);
+  double avg_cost = 0.0;
+  double max_block_cost = 0.0;
+  for (int b = 0; b < nblocks; b++) {
+    avg_cost += cost[b];
+    max_block_cost = std::max(max_block_cost, cost[b]);
+  }
+  avg_cost /= max_rank;
+  return std::make_tuple(avg_cost, max_block_cost);
+}
 
-  double const mincost = min_max.first == costlist.begin() ? 0.0 : *min_max.first;
-  double const maxcost = min_max.second == costlist.begin() ? 0.0 : *min_max.second;
+void SetSimpleBalance(const std::vector<double> &cost, std::vector<int> &start, std::vector<int> &nb) {
+  const int nblocks = cost.size();
+  const int max_rank = std::min(nblocks, Globals::nranks);
+  int nassign = nblocks/max_rank;
+  start[0] = 0;
+  nb[0] = nassign;
+  for (int i = 1; i < max_rank; i++) {
+    nassign = (nblocks - i*nassign)/(max_rank - i);
+    start[i] = start[i-1] + nb[i-1];
+    nb[i] = nassign;
+  }
+}
 
-  // Assigns blocks to ranks on a rougly cost-equal basis.
-  AssignBlocks(costlist, ranklist);
+} // namespace
 
-  // Updates nslist with the ID of the starting block on each rank and the count of blocks
-  // on each rank.
-  UpdateBlockList(ranklist, nslist, nblist);
+void Mesh::CalculateLoadBalance(std::vector<double> const &cost,
+                                std::vector<int> &rank, std::vector<int> &start,
+                                std::vector<int> &nb) {
+  Kokkos::Profiling::pushRegion("CalculateLoadBalance");
+  if ((lb_automatic_ || lb_manual_)) {
+    auto [avg_cost, max_block_cost] = BlockCostInfo(cost);
+    double new_max = CalculateNewBalance(cost, start, nb, avg_cost, max_block_cost);
 
-#ifdef MPI_PARALLEL
-  if (total_blocks % (Globals::nranks) != 0 && !adaptive && !lb_flag_ &&
-      maxcost == mincost && Globals::my_rank == 0) {
-    std::cout << "### Warning in CalculateLoadBalance" << std::endl
-              << "The number of MeshBlocks cannot be divided evenly. "
-              << "This will result in poor load balancing." << std::endl;
-  }
-#endif
-  if (Globals::nranks > total_blocks) {
-    if (!adaptive) {
-      // mesh is refined statically, treat this an as error (all ranks need to
-      // participate)
-      std::stringstream msg;
-      msg << "### FATAL ERROR in CalculateLoadBalance" << std::endl
-          << "There are fewer MeshBlocks than OpenMP threads on each MPI rank"
-          << std::endl
-          << "Decrease the number of threads or use more MeshBlocks." << std::endl;
-      PARTHENON_FAIL(msg);
-    } else if (Globals::my_rank == 0) {
-      // we have AMR, print warning only on Rank 0
-      std::cout << "### WARNING in CalculateLoadBalance" << std::endl
-                << "There are fewer MeshBlocks than OpenMP threads on each MPI rank"
-                << std::endl
-                << "This is likely fine if the number of meshblocks is expected to grow "
-                   "during the "
-                   "simulations. Otherwise, it might be worthwhile to decrease the "
-                   "number of threads or "
-                   "use more meshblocks."
-                << std::endl;
-    }
+  } else {
+    // just try to distribute blocks evenly
+    SetSimpleBalance(cost, start, nb);
   }
+  AssignBlocks(start, nb, rank);
+  // now assign blocks to ranks
   Kokkos::Profiling::popRegion(); // CalculateLoadBalance
 }
 
@@ -461,34 +469,18 @@ void Mesh::CalculateLoadBalance(std::vector<double> const &costlist,
 
 void Mesh::ResetLoadBalanceVariables() {
   if (lb_automatic_) {
-    for (auto &pmb : block_list) {
-      costlist[pmb->gid] = TINY_NUMBER;
-      pmb->ResetTimeMeasurement();
-    }
+#ifdef ENABLE_LB_TIMERS
     parthenon::par_for(loop_pattern_flatrange_tag, "reset cost_d", DevExecSpace(), 0, block_list.size()-1,
         KOKKOS_LAMBDA(const int b) {
-          cost_d[b] = TINY_NUMBER;
+          block_cost(b) = TINY_NUMBER;
         });
-  }
-  lb_flag_ = false;
-  step_since_lb = 0;
-}
-
-//----------------------------------------------------------------------------------------
-// \!fn void Mesh::UpdateCostList()
-// \brief update the cost list
-
-void Mesh::UpdateCostList() {
-  if (lb_automatic_) {
-    double w = static_cast<double>(lb_interval_ - 1) / static_cast<double>(lb_interval_);
-    for (auto &pmb : block_list) {
-      costlist[pmb->gid] = costlist[pmb->gid] * w + pmb->cost_;
-    }
-  } else if (lb_flag_) {
-    for (auto &pmb : block_list) {
-      costlist[pmb->gid] = pmb->cost_;
+#endif
+  } else if (lb_manual_) {
+    for (int b = 0; b < block_list.size(); b++) {
+      block_cost[b] = TINY_NUMBER;
     }
   }
+  step_since_lb = 0;
 }
 
 //----------------------------------------------------------------------------------------
@@ -634,41 +626,66 @@ void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) {
 // \!fn bool Mesh::GatherCostListAndCheckBalance()
 // \brief collect the cost from MeshBlocks and check the load balance
 
-bool Mesh::GatherCostListAndCheckBalance() {
-  if (lb_manual_ || lb_automatic_) {
+void Mesh::GatherCostList() {
+  Kokkos::Profiling::pushRegion("GatherCostList");
+  if (lb_automatic_) {
+#ifdef ENABLE_LB_TIMERS
+    auto cost_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), block_cost);
 #ifdef MPI_PARALLEL
-    PARTHENON_MPI_CHECK(MPI_Allgatherv(MPI_IN_PLACE, nblist[Globals::my_rank], MPI_DOUBLE,
+    PARTHENON_MPI_CHECK(MPI_Allgatherv(cost_h.data(), nblist[Globals::my_rank], MPI_DOUBLE,
                                        costlist.data(), nblist.data(), nslist.data(),
                                        MPI_DOUBLE, MPI_COMM_WORLD));
 #endif
-    double maxcost = 0.0, avecost = 0.0;
-    for (int rank = 0; rank < Globals::nranks; rank++) {
-      double rcost = 0.0;
-      int ns = nslist[rank];
-      int ne = ns + nblist[rank];
-      for (int n = ns; n < ne; ++n)
-        rcost += costlist[n];
-      maxcost = std::max(maxcost, rcost);
-      avecost += rcost;
-    }
-    avecost /= Globals::nranks;
-
-    if (adaptive)
-      lb_tolerance_ =
-          2.0 * static_cast<double>(Globals::nranks) / static_cast<double>(nbtotal);
-
-    if (maxcost > (1.0 + lb_tolerance_) * avecost) return false;
+#endif
   }
-  return true;
+  if (lb_manual_) {
+#ifdef MPI_PARALLEL
+    PARTHENON_MPI_CHECK(MPI_Allgatherv(block_cost.data(), nblist[Globals::my_rank], MPI_DOUBLE,
+                                       costlist.data(), nblist.data(), nslist.data(),
+                                       MPI_DOUBLE, MPI_COMM_WORLD));
+#endif
+  }
+  Kokkos::Profiling::popRegion();
+  return;
 }
 
+
 //----------------------------------------------------------------------------------------
 // \!fn void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot)
 // \brief redistribute MeshBlocks according to the new load balance
 
 void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in,
-                                           int ntot) {
+                                           int ntot, bool modified) {
   Kokkos::Profiling::pushRegion("RedistributeAndRefineMeshBlocks");
+
+  GatherCostList();
+  // store old nbstart and nbend before load balancing.
+  int onbs = nslist[Globals::my_rank];
+  int onbe = onbs + nblist[Globals::my_rank] - 1;
+
+  if (!modified) {
+    // The mesh hasn't actually changed.  Let's just check the load balancing
+    // and only move things around if needed
+    if (lb_automatic_ || lb_manual_) {
+      Kokkos::Profiling::pushRegion("Reloadbalance");
+      auto [avg_cost, max_block_cost] = BlockCostInfo(costlist);
+      std::vector<int> start_trial, nb_trial;
+      double new_max = CalculateNewBalance(costlist, start_trial, nb_trial, avg_cost, max_block_cost);
+      Kokkos::Profiling::popRegion();
+      // if the improvement isn't large enough, just return because we're done
+      if ((max_block_cost - new_max)/max_block_cost < lb_tolerance_) return;
+      AssignBlocks(start_trial, nb_trial, ranklist);
+      nslist = std::move(start_trial);
+      nblist = std::move(nb_trial);
+    } else {
+      // default balancing on number of meshblocks should be good to go since
+      // the mesh hasn't changed
+      return;
+    }
+  }
+
+  // if we got here, we're going to be changing or moving the mesh around
+
   // kill any cached packs
   mesh_data.PurgeNonBase();
   mesh_data.Get()->ClearCaches();
@@ -721,14 +738,10 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     }
   }
 
-  // store old nbstart and nbend before load balancing.
-  int onbs = nslist[Globals::my_rank];
-  int onbe = onbs + nblist[Globals::my_rank] - 1;
-
   Kokkos::Profiling::popRegion(); // Construct new list
 
   // Calculate new load balance
-  CalculateLoadBalance(newcost, newrank, nslist, nblist);
+  if (modified) CalculateLoadBalance(newcost, newrank, nslist, nblist);
 
   int nbs = nslist[Globals::my_rank];
   int nbe = nbs + nblist[Globals::my_rank] - 1;
@@ -937,7 +950,12 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   loclist = std::move(newloc);
   ranklist = std::move(newrank);
   costlist = std::move(newcost);
-  cost_d.Realloc(nbe - nbs + 1);
+
+#ifdef ENABLE_LB_TIMERS
+  block_cost.Realloc(nbe - nbs + 1);
+#else
+  block_cost.resize(nbe - nbs + 1);
+#endif
 
   // re-initialize the MeshBlocks
   for (auto &pmb : block_list) {
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 1c856748c1a4..254c83a4423a 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -473,7 +473,11 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
                         packages, resolved_packages, gflag);
     block_list[i - nbs]->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
   }
-  cost_d.Realloc(block_list.size());
+#ifdef ENABLE_LB_TIMERS
+  block_cost.Realloc(block_list.size());
+#else
+  block_cost.resize(block_list.size());
+#endif
 
   ResetLoadBalanceVariables();
 
@@ -738,7 +742,11 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,
     block_list[i - nbs]->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
   }
 
-  cost_d.Realloc(block_list.size());
+#ifdef ENABLE_LB_TIMERS
+  block_cost.Realloc(block_list.size());
+#else
+  block_cost.resize(block_list.size());
+#endif
   ResetLoadBalanceVariables();
 
   // Output variables in use in this run
@@ -1259,12 +1267,13 @@ void Mesh::RegisterLoadBalancing_(ParameterInput *pin) {
   const std::string balancer =
       pin->GetOrAddString("parthenon/loadbalancing", "balancer", "default",
                           std::vector<std::string>{"default", "automatic", "manual"});
+#ifndef ENABLE_LB_TIMERS
+  if (balancer == "automatic") {
+    PARTHENON_FAIL("Cannot use automatic load balancing without enabling timers. "
+                   "Rebuild with -DPARTHENON_ENABLE_LB_TIMERS=ON or change balancer");
+  }
+#endif
   if (balancer == "automatic") {
-    // JMM: I am disabling timing based load balancing, as it's not
-    // threaded through the infrastructure. I think some thought needs
-    // to go into doing this right with loops over meshdata rather
-    // than loops over data on a single meshblock.
-    PARTHENON_FAIL("Timing based load balancing is currently unavailable.");
     lb_automatic_ = true;
   } else if (balancer == "manual") {
     lb_manual_ = true;
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 48147e55e67a..05242dfd0c44 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -107,7 +107,11 @@ class Mesh {
   BlockList_t block_list;
   Packages_t packages;
   std::shared_ptr<StateDescriptor> resolved_packages;
-  ParArray1D<double> cost_d;
+#ifdef ENABLE_LB_TIMERS
+  ParArray1D<double> block_cost;
+#else
+  std::vector<double> block_cost;
+#endif
 
   DataCollection<MeshData<Real>> mesh_data;
 
@@ -263,17 +267,16 @@ class Mesh {
   // functions
   MeshGenFunc MeshGenerator_[4];
 
-  void CalculateLoadBalance(std::vector<double> const &costlist,
-                            std::vector<int> &ranklist, std::vector<int> &nslist,
-                            std::vector<int> &nblist);
   void ResetLoadBalanceVariables();
 
+  void CalculateLoadBalance(std::vector<double> const &cost,
+                            std::vector<int> &rank, std::vector<int> &start,
+                            std::vector<int> &nb);
   // Mesh::LoadBalancingAndAdaptiveMeshRefinement() helper functions:
-  void UpdateCostList();
   void UpdateMeshBlockTree(int &nnew, int &ndel);
-  bool GatherCostListAndCheckBalance();
+  void GatherCostList();
   void RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in,
-                                       int ntot);
+                                       int ntot, bool modified);
 
   // defined in either the prob file or default_pgen.cpp in ../pgen/
   static void InitUserMeshDataDefault(Mesh *mesh, ParameterInput *pin);
diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index b4e98ed58405..8bd680611d64 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -22,19 +22,30 @@
 #include <Kokkos_Core.hpp>
 #include <impl/Kokkos_ClockTic.hpp>
 
+#include "config.hpp"
+
 namespace parthenon {
 
 template <typename T>
 class BlockTimer {
+#ifdef ENABLE_LB_TIMERS
  public:
   BlockTimer() = delete;
-  // constructor for team policies
+  // constructor for team policies when there is no pack
+  KOKKOS_INLINE_FUNCTION
+  BlockTimer(team_mbr_t &member, T cost)
+    : member_(&member), pack_(cost), b_(0), start_(Kokkos::Impl::clock_tic()) {}
+  // constructor for non-team policies when there is no pack
   KOKKOS_INLINE_FUNCTION
-  BlockTimer(team_mbr_t &member, const T &pack, const int b = 0)
+  explicit BlockTimer(T cost)
+    : member_(nullptr), pack_(cost), b_(0), start_(Kokkos::Impl::clock_tic()) {}
+  // constructor for team policies and packs
+  KOKKOS_INLINE_FUNCTION
+  BlockTimer(team_mbr_t &member, const T &pack, const int b)
     : member_(&member), pack_(pack), b_(b), start_(Kokkos::Impl::clock_tic()) {}
-  // constructor for non-team policies
+  // constructor for non-team policies and packs
   KOKKOS_INLINE_FUNCTION
-  BlockTimer(const T &pack, const int b = 0)
+  BlockTimer(const T &pack, const int b)
     : member_(nullptr), pack_(pack), b_(b), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for team policies without a block index
   KOKKOS_INLINE_FUNCTION
@@ -46,11 +57,21 @@ class BlockTimer {
                     + static_cast<double>(stop) :
                   static_cast<double>(stop - start_));
     if (member_ == nullptr) {
-      Kokkos::atomic_add(&(pack_.GetCost(b_)), diff);
-    } else {
-      Kokkos::single(Kokkos::PerTeam(*member_), [&] () {
+      if constexpr (std::is_same<T, double *>::value) {
+        Kokkos::atomic_add(pack_, diff);
+      } else {
         Kokkos::atomic_add(&(pack_.GetCost(b_)), diff);
-      });
+      }
+    } else {
+      if constexpr (std::is_same<T, double *>::value) {
+        Kokkos::single(Kokkos::PerTeam(*member_), [&] () {
+          Kokkos::atomic_add(pack_, diff);
+        });
+      } else {
+        Kokkos::single(Kokkos::PerTeam(*member_), [&] () {
+          Kokkos::atomic_add(&(pack_.GetCost(b_)), diff);
+        });
+      }
     }
   }
  private:
@@ -58,7 +79,12 @@ class BlockTimer {
   const T &pack_;
   const int b_;
   const uint64_t start_;
-  ;
+#else // no timers, so just stub this out
+ public:
+  template <typename... Args>
+  KOKKOS_INLINE_FUNCTION
+  BlockTimer(Args &&... args) {}
+#endif
 };
 
 } // namespace parthenon

From 39724012eb1430a346e6a72272a75071daaba442 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 31 Aug 2023 17:44:45 -0600
Subject: [PATCH 11/67] fix some things

---
 src/mesh/amr_loadbalance.cpp | 38 +++++++++++++++++++-----------------
 src/mesh/mesh.hpp            |  2 +-
 src/utils/block_timer.hpp    |  2 +-
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 4bf024437e15..c6808d988d9f 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -326,10 +326,11 @@ void Mesh::LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin,
     nbdel += ndel;
   }
 
+  modified = false;
   if (nnew != 0 || ndel != 0) { // at least one (de)refinement happened
-    RedistributeAndRefineMeshBlocks(pin, app_in, nbtotal + nnew - ndel, true);
+    modified = RedistributeAndRefineMeshBlocks(pin, app_in, nbtotal + nnew - ndel, true);
   } else if (step_since_lb >= lb_interval_) {
-    RedistributeAndRefineMeshBlocks(pin, app_in, nbtotal, false);
+    modified = RedistributeAndRefineMeshBlocks(pin, app_in, nbtotal, false);
   }
   Kokkos::Profiling::popRegion(); // LoadBalancingAndAdaptiveMeshRefinement
 }
@@ -367,8 +368,6 @@ Real DistributeTrial(std::vector<Real> const &cost, std::vector<int> &start,
 
 double CalculateNewBalance(std::vector<double> const &cost, std::vector<int> &start, std::vector<int> &nb,
                            const double avg_cost, const double max_block_cost) {
-  start.resize(Globals::nranks);
-  nb.resize(Globals::nranks);
   const int nblocks = cost.size();
   const int max_rank = std::min(nblocks, Globals::nranks);
 
@@ -431,8 +430,7 @@ std::tuple<double, double> BlockCostInfo(std::vector<double> const &cost) {
   return std::make_tuple(avg_cost, max_block_cost);
 }
 
-void SetSimpleBalance(const std::vector<double> &cost, std::vector<int> &start, std::vector<int> &nb) {
-  const int nblocks = cost.size();
+void SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<int> &nb) {
   const int max_rank = std::min(nblocks, Globals::nranks);
   int nassign = nblocks/max_rank;
   start[0] = 0;
@@ -456,7 +454,7 @@ void Mesh::CalculateLoadBalance(std::vector<double> const &cost,
 
   } else {
     // just try to distribute blocks evenly
-    SetSimpleBalance(cost, start, nb);
+    SetSimpleBalance(cost.size(), start, nb);
   }
   AssignBlocks(start, nb, rank);
   // now assign blocks to ranks
@@ -654,33 +652,37 @@ void Mesh::GatherCostList() {
 // \!fn void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot)
 // \brief redistribute MeshBlocks according to the new load balance
 
-void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in,
+bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in,
                                            int ntot, bool modified) {
   Kokkos::Profiling::pushRegion("RedistributeAndRefineMeshBlocks");
 
   GatherCostList();
-  // store old nbstart and nbend before load balancing.
-  int onbs = nslist[Globals::my_rank];
-  int onbe = onbs + nblist[Globals::my_rank] - 1;
+  // store old things
+  const int onbs = nslist[Globals::my_rank];
+  const int onbe = onbs + nblist[Globals::my_rank] - 1;
+  const int nbtold = nbtotal;
 
+  std::vector<int> newrank;
   if (!modified) {
     // The mesh hasn't actually changed.  Let's just check the load balancing
     // and only move things around if needed
     if (lb_automatic_ || lb_manual_) {
       Kokkos::Profiling::pushRegion("Reloadbalance");
       auto [avg_cost, max_block_cost] = BlockCostInfo(costlist);
-      std::vector<int> start_trial, nb_trial;
+      std::vector<int> start_trial(Globals::nranks);
+      std::vector<int> nb_trial(Globals::nranks);
       double new_max = CalculateNewBalance(costlist, start_trial, nb_trial, avg_cost, max_block_cost);
       Kokkos::Profiling::popRegion();
       // if the improvement isn't large enough, just return because we're done
-      if ((max_block_cost - new_max)/max_block_cost < lb_tolerance_) return;
-      AssignBlocks(start_trial, nb_trial, ranklist);
+      if ((max_block_cost - new_max)/max_block_cost < lb_tolerance_) return false;
+      newrank.resize(ntot);
+      AssignBlocks(start_trial, nb_trial, newrank);
       nslist = std::move(start_trial);
       nblist = std::move(nb_trial);
     } else {
       // default balancing on number of meshblocks should be good to go since
       // the mesh hasn't changed
-      return;
+      return false;
     }
   }
 
@@ -698,12 +700,11 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   // construct new lists
   Kokkos::Profiling::pushRegion("Construct new list");
   std::vector<LogicalLocation> newloc(ntot);
-  std::vector<int> newrank(ntot);
+  if (newrank.size() == 0) newrank.resize(ntot);
   std::vector<double> newcost(ntot);
   std::vector<int> newtoold(ntot);
-  std::vector<int> oldtonew(nbtotal);
+  std::vector<int> oldtonew(nbtold);
 
-  int nbtold = nbtotal;
   tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal);
 
   // create a list mapping the previous gid to the current one
@@ -966,5 +967,6 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   ResetLoadBalanceVariables();
 
   Kokkos::Profiling::popRegion(); // RedistributeAndRefineMeshBlocks
+  return true;
 }
 } // namespace parthenon
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 05242dfd0c44..3b720dc8c20a 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -275,7 +275,7 @@ class Mesh {
   // Mesh::LoadBalancingAndAdaptiveMeshRefinement() helper functions:
   void UpdateMeshBlockTree(int &nnew, int &ndel);
   void GatherCostList();
-  void RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in,
+  bool RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in,
                                        int ntot, bool modified);
 
   // defined in either the prob file or default_pgen.cpp in ../pgen/
diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index 8bd680611d64..5326d1d85f3d 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -26,7 +26,7 @@
 
 namespace parthenon {
 
-template <typename T>
+template <typename T = void>
 class BlockTimer {
 #ifdef ENABLE_LB_TIMERS
  public:

From 013e53ba993b5b92e2d72358d5b2c77b259191cb Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 31 Aug 2023 18:01:18 -0600
Subject: [PATCH 12/67] fix another silly bug

---
 src/mesh/amr_loadbalance.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index c6808d988d9f..86296c184e45 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -435,10 +435,12 @@ void SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<in
   int nassign = nblocks/max_rank;
   start[0] = 0;
   nb[0] = nassign;
+  int nassigned = nassign;
   for (int i = 1; i < max_rank; i++) {
-    nassign = (nblocks - i*nassign)/(max_rank - i);
+    nassign = (nblocks - nassigned)/(max_rank - i);
     start[i] = start[i-1] + nb[i-1];
     nb[i] = nassign;
+    nassigned += nassign;
   }
 }
 

From cf31ae979b3c28a4a1794e24b37759e0a095a265 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 1 Sep 2023 09:13:13 -0600
Subject: [PATCH 13/67] try to allow fewer blocks than ranks on base mesh

---
 src/mesh/amr_loadbalance.cpp |  7 ++++---
 src/mesh/mesh.cpp            | 33 ++++++++-------------------------
 src/mesh/mesh.hpp            | 10 ++++++++++
 3 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 86296c184e45..18997ea327db 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -771,8 +771,9 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     }
   }
   restriction_cache.CopyToDevice();
+  auto [cellbounds, c_cellbounds] = GetCellBounds();
   refinement::Restrict(resolved_packages.get(), restriction_cache,
-                       block_list[0]->cellbounds, block_list[0]->c_cellbounds);
+                       cellbounds, c_cellbounds);
 
   Kokkos::fence();
 
@@ -938,9 +939,9 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   }
   prolongation_cache.CopyToDevice();
   refinement::ProlongateShared(resolved_packages.get(), prolongation_cache,
-                               block_list[0]->cellbounds, block_list[0]->c_cellbounds);
+                               cellbounds, c_cellbounds);
   refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache,
-                                 block_list[0]->cellbounds, block_list[0]->c_cellbounds);
+                                 cellbounds, c_cellbounds);
 
 #ifdef MPI_PARALLEL
   if (send_reqs.size() != 0)
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 254c83a4423a..4c3993517908 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -103,7 +103,6 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
                                                     UniformMeshGenerator<X3DIR>},
       MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr} {
   std::stringstream msg;
-  RegionSize block_size;
   BoundaryFlag block_bcs[6];
   std::int64_t nbmax;
 
@@ -408,22 +407,6 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
   loclist.resize(nbtotal);
   tree.GetMeshBlockList(loclist.data(), nullptr, nbtotal);
 
-#ifdef MPI_PARALLEL
-  // check if there are sufficient blocks
-  if (nbtotal < Globals::nranks) {
-    if (mesh_test == 0) {
-      msg << "### FATAL ERROR in Mesh constructor" << std::endl
-          << "Too few mesh blocks: nbtotal (" << nbtotal << ") < nranks ("
-          << Globals::nranks << ")" << std::endl;
-      PARTHENON_FAIL(msg);
-    } else { // test
-      std::cout << "### Warning in Mesh constructor" << std::endl
-                << "Too few mesh blocks: nbtotal (" << nbtotal << ") < nranks ("
-                << Globals::nranks << ")" << std::endl;
-    }
-  }
-#endif
-
   ranklist = std::vector<int>(nbtotal);
 
   nslist = std::vector<int>(Globals::nranks);
@@ -534,7 +517,6 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,
                                                     UniformMeshGenerator<X3DIR>},
       MeshBndryFnctn{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr} {
   std::stringstream msg;
-  RegionSize block_size;
   BoundaryFlag block_bcs[6];
 
   // mesh test
@@ -774,7 +756,6 @@ Mesh::~Mesh() {
 
 void Mesh::OutputMeshStructure(const int ndim,
                                const bool dump_mesh_structure /*= true*/) {
-  RegionSize block_size;
   BoundaryFlag block_bcs[6];
 
   // Write overall Mesh structure to stdout and file
@@ -1024,7 +1005,7 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
     // problem generator
     if (init_problem) {
       PARTHENON_REQUIRE_THROWS(
-          !(ProblemGenerator != nullptr && block_list[0]->ProblemGenerator != nullptr),
+          !(ProblemGenerator != nullptr && app_in->ProblemGenerator != nullptr),
           "Mesh and MeshBlock ProblemGenerators are defined. Please use only one.");
 
       // Call Mesh ProblemGenerator
@@ -1192,6 +1173,10 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
     }
   } while (!init_done);
 
+  PARTHENON_REQUIRE_THROWS(nbtotal >= Globals::nranks,
+    "After initialization, there are fewer meshblocks than ranks. Parthenon requires at "
+    "least as many blocks as ranks. Change your settings and try again.");
+
   // Initialize the "base" MeshData object
   mesh_data.Get()->Set(block_list);
 
@@ -1251,15 +1236,13 @@ void Mesh::SetBlockSizeAndBoundaries(LogicalLocation loc, RegionSize &block_size
 }
 
 std::int64_t Mesh::GetTotalCells() {
-  auto &pmb = block_list.front();
-  return static_cast<std::int64_t>(nbtotal) * pmb->block_size.nx(X1DIR) *
-         pmb->block_size.nx(X2DIR) * pmb->block_size.nx(X3DIR);
+  return static_cast<std::int64_t>(nbtotal) * GetNumberOfMeshBlockCells();
 }
 // TODO(JMM): Move block_size into mesh.
 int Mesh::GetNumberOfMeshBlockCells() const {
-  return block_list.front()->GetNumberOfMeshBlockCells();
+  return block_size.nx(X1DIR) * block_size.nx(X2DIR) * block_size.nx(X3DIR);
 }
-const RegionSize &Mesh::GetBlockSize() const { return block_list.front()->block_size; }
+const RegionSize &Mesh::GetBlockSize() const { return block_size; }
 
 // Functionality re-used in mesh constructor
 void Mesh::RegisterLoadBalancing_(ParameterInput *pin) {
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 3b720dc8c20a..0291ced3ecc1 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -95,6 +95,7 @@ class Mesh {
   // data
   bool modified;
   RegionSize mesh_size;
+  RegionSize block_size;
   BoundaryFlag mesh_bcs[BOUNDARY_NFACES];
   const int ndim; // number of dimensions
   const bool adaptive, multilevel;
@@ -267,6 +268,15 @@ class Mesh {
   // functions
   MeshGenFunc MeshGenerator_[4];
 
+  std::pair<IndexShape, IndexShape> GetCellBounds() const {
+    auto cb = [&](const int rfact) {
+      return IndexShape(block_size.nx(X3DIR)/rfact,
+                        block_size.nx(X2DIR)/rfact,
+                        block_size.nx(X1DIR)/rfact, multilevel*Globals::nghost);
+    };
+    return std::make_pair(cb(1), cb(2));
+  }
+
   void ResetLoadBalanceVariables();
 
   void CalculateLoadBalance(std::vector<double> const &cost,

From dd0ebf5add2742d09a99ca5f4bf045bcf62efc8a Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 1 Sep 2023 09:17:04 -0600
Subject: [PATCH 14/67] missing include

---
 src/utils/block_timer.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index 5326d1d85f3d..d30cf945ae94 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -19,10 +19,10 @@
 
 #include <limits>
 
-#include <Kokkos_Core.hpp>
 #include <impl/Kokkos_ClockTic.hpp>
 
 #include "config.hpp"
+#include "kokkos_abstraction.hpp"
 
 namespace parthenon {
 

From 335291ce89379bce71cae2379b6f0b49867a372d Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 1 Sep 2023 10:21:55 -0600
Subject: [PATCH 15/67] fix bug in new Mesh::GetCellBounds function

---
 src/mesh/mesh.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 0291ced3ecc1..1addc5eb495f 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -270,8 +270,8 @@ class Mesh {
 
   std::pair<IndexShape, IndexShape> GetCellBounds() const {
     auto cb = [&](const int rfact) {
-      return IndexShape(block_size.nx(X3DIR)/rfact,
-                        block_size.nx(X2DIR)/rfact,
+      return IndexShape((ndim > 2) * block_size.nx(X3DIR)/rfact,
+                        (ndim > 1) * block_size.nx(X2DIR)/rfact,
                         block_size.nx(X1DIR)/rfact, multilevel*Globals::nghost);
     };
     return std::make_pair(cb(1), cb(2));

From a6032683532482f0ebc2005c2bab42410195c340 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 1 Sep 2023 11:23:09 -0600
Subject: [PATCH 16/67] another guard for empty block lists

---
 src/mesh/mesh.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 1addc5eb495f..1191b57f9aff 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -124,7 +124,7 @@ class Mesh {
   void LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin,
                                               ApplicationInput *app_in);
   int DefaultPackSize() {
-    return default_pack_size_ < 1 ? block_list.size() : default_pack_size_;
+    return default_pack_size_ < 1 ? std::max(block_list.size(), 1) : default_pack_size_;
   }
   int DefaultNumPartitions() {
     return partition::partition_impl::IntCeil(block_list.size(), DefaultPackSize());

From 953cfde6926fef974aeaf59613eb16d1bdb207ea Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 1 Sep 2023 12:02:25 -0600
Subject: [PATCH 17/67] oh come on

---
 src/mesh/mesh.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 1191b57f9aff..a823f1ccfd4f 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -124,7 +124,8 @@ class Mesh {
   void LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin,
                                               ApplicationInput *app_in);
   int DefaultPackSize() {
-    return default_pack_size_ < 1 ? std::max(block_list.size(), 1) : default_pack_size_;
+    int nb = block_list.size();
+    return default_pack_size_ < 1 ? std::max(nb), 1) : default_pack_size_;
   }
   int DefaultNumPartitions() {
     return partition::partition_impl::IntCeil(block_list.size(), DefaultPackSize());

From 072dba2dce9b95a75a44f662b07af99daeb1f6ab Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 1 Sep 2023 12:02:56 -0600
Subject: [PATCH 18/67] agh

---
 src/mesh/mesh.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index a823f1ccfd4f..6441310a19f0 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -125,7 +125,7 @@ class Mesh {
                                               ApplicationInput *app_in);
   int DefaultPackSize() {
     int nb = block_list.size();
-    return default_pack_size_ < 1 ? std::max(nb), 1) : default_pack_size_;
+    return default_pack_size_ < 1 ? std::max(nb, 1) : default_pack_size_;
   }
   int DefaultNumPartitions() {
     return partition::partition_impl::IntCeil(block_list.size(), DefaultPackSize());

From df4f226eccd46cc4166657bad7df4f9be0ee56e3 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 13:20:56 -0600
Subject: [PATCH 19/67] remove stray const

---
 src/interface/sparse_pack.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp
index acbf57adf8cb..76004520780f 100644
--- a/src/interface/sparse_pack.hpp
+++ b/src/interface/sparse_pack.hpp
@@ -160,7 +160,7 @@ class SparsePack : public SparsePackBase {
 #ifdef ENABLE_LB_TIMERS
   // Methods used for timing blocks
   KOKKOS_INLINE_FUNCTION
-  const int GetLid(const int b) const {
+  int GetLid(const int b) const {
     return lid_(b);
   }
 

From e0d4fb6b6a136e001f4ca79b47454cf8a54f9f1c Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 15:25:12 -0600
Subject: [PATCH 20/67] change require to warn for nbtotal >= nranks

---
 src/mesh/mesh.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 4c3993517908..2e36f9c566a4 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -1173,9 +1173,8 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
     }
   } while (!init_done);
 
-  PARTHENON_REQUIRE_THROWS(nbtotal >= Globals::nranks,
-    "After initialization, there are fewer meshblocks than ranks. Parthenon requires at "
-    "least as many blocks as ranks. Change your settings and try again.");
+  PARTHENON_WARN(nbtotal >= Globals::nranks,
+    "Fewer meshblocks than ranks.  Some ranks will be idle.");
 
   // Initialize the "base" MeshData object
   mesh_data.Get()->Set(block_list);

From 188b5c2afa01cc25e7e84b0785e9a54505078c47 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 15:33:16 -0600
Subject: [PATCH 21/67] oops

---
 src/mesh/mesh.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 2e36f9c566a4..22539a538182 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -1173,8 +1173,9 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
     }
   } while (!init_done);
 
-  PARTHENON_WARN(nbtotal >= Globals::nranks,
-    "Fewer meshblocks than ranks.  Some ranks will be idle.");
+  if (nbtotal < Globals::nranks) {
+    PARTHENON_WARN("Fewer meshblocks than ranks.  Some ranks will be idle.");
+  }
 
   // Initialize the "base" MeshData object
   mesh_data.Get()->Set(block_list);

From 269a540e7981ee2dc1e2a59c73c9cbacbe329ca6 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 15:49:20 -0600
Subject: [PATCH 22/67] trying to fix nbtotal < nranks

---
 src/interface/mesh_data.hpp | 8 ++++++--
 src/mesh/mesh.cpp           | 4 ++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp
index b5ef97f642ce..56e573d58198 100644
--- a/src/interface/mesh_data.hpp
+++ b/src/interface/mesh_data.hpp
@@ -227,13 +227,17 @@ class MeshData {
     }
   }
 
-  void Set(BlockList_t blocks) {
+  void Set(BlockList_t blocks, Mesh *pm) {
     const int nblocks = blocks.size();
     block_data_.resize(nblocks);
-    SetMeshPointer(blocks[0]->pmy_mesh);
+    SetMeshPointer(pmy);
     for (int i = 0; i < nblocks; i++) {
       block_data_[i] = blocks[i]->meshblock_data.Get(stage_name_);
     }
+
+  }
+  void Set(BlockList_t blocks) {
+    Set(blocks, blocks[0]->pmy_mesh);
   }
 
   void Initialize(const MeshData<T> *src, const std::vector<std::string> &names,
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 22539a538182..43adc9d04632 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -1173,12 +1173,12 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
     }
   } while (!init_done);
 
-  if (nbtotal < Globals::nranks) {
+  if (Globals::my_rank == 0 && nbtotal < Globals::nranks) {
     PARTHENON_WARN("Fewer meshblocks than ranks.  Some ranks will be idle.");
   }
 
   // Initialize the "base" MeshData object
-  mesh_data.Get()->Set(block_list);
+  mesh_data.Get()->Set(block_list, this);
 
   Kokkos::Profiling::popRegion(); // Mesh::Initialize
 }

From e86eb331361386b93ba19508370e3ee786233154 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 15:50:18 -0600
Subject: [PATCH 23/67] oops

---
 src/interface/mesh_data.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp
index 56e573d58198..c824830bbd24 100644
--- a/src/interface/mesh_data.hpp
+++ b/src/interface/mesh_data.hpp
@@ -230,7 +230,7 @@ class MeshData {
   void Set(BlockList_t blocks, Mesh *pm) {
     const int nblocks = blocks.size();
     block_data_.resize(nblocks);
-    SetMeshPointer(pmy);
+    SetMeshPointer(pm);
     for (int i = 0; i < nblocks; i++) {
       block_data_[i] = blocks[i]->meshblock_data.Get(stage_name_);
     }

From 9f1cea9f0bccee7c6e9768524dab754d7ff2016e Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 16:07:40 -0600
Subject: [PATCH 24/67] more things to try nblocks < nranks

---
 src/outputs/parthenon_hdf5.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index 95b89b93b627..66611c52216d 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -82,9 +82,10 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
   auto const &first_block = *(pm->block_list.front());
 
-  const IndexRange out_ib = first_block.cellbounds.GetBoundsI(theDomain);
-  const IndexRange out_jb = first_block.cellbounds.GetBoundsJ(theDomain);
-  const IndexRange out_kb = first_block.cellbounds.GetBoundsK(theDomain);
+  auto [cellbounds, c_cellbounds] = pm->GetCellBounds();
+  const IndexRange out_ib = cellbounds.GetBoundsI(theDomain);
+  const IndexRange out_jb = cellbounds.GetBoundsJ(theDomain);
+  const IndexRange out_kb = cellbounds.GetBoundsK(theDomain);
 
   auto const nx1 = out_ib.e - out_ib.s + 1;
   auto const nx2 = out_jb.e - out_jb.s + 1;
@@ -152,7 +153,8 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
                        info_group);
     // write number of ghost cells in simulation
     HDF5WriteAttribute("NGhost", Globals::nghost, info_group);
-    HDF5WriteAttribute("Coordinates", std::string(first_block.coords.Name()).c_str(),
+    auto coords = Coordinates_t();
+    HDF5WriteAttribute("Coordinates", std::string(coords.Name()).c_str(),
                        info_group);
 
     // restart info, write always

From bc04b4257f7e0831f1f4df37e1fc133d6499d92e Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 16:09:12 -0600
Subject: [PATCH 25/67] move GetCellBounds to public

---
 src/mesh/mesh.hpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 6441310a19f0..45e92e1085ca 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -222,6 +222,15 @@ class Mesh {
     return resolved_packages->GetVariableNames(std::forward<Args>(args)...);
   }
 
+  std::pair<IndexShape, IndexShape> GetCellBounds() const {
+    auto cb = [&](const int rfact) {
+      return IndexShape((ndim > 2) * block_size.nx(X3DIR)/rfact,
+                        (ndim > 1) * block_size.nx(X2DIR)/rfact,
+                        block_size.nx(X1DIR)/rfact, multilevel*Globals::nghost);
+    };
+    return std::make_pair(cb(1), cb(2));
+  }
+
  private:
   // data
   int root_level, max_level, current_level;
@@ -269,15 +278,6 @@ class Mesh {
   // functions
   MeshGenFunc MeshGenerator_[4];
 
-  std::pair<IndexShape, IndexShape> GetCellBounds() const {
-    auto cb = [&](const int rfact) {
-      return IndexShape((ndim > 2) * block_size.nx(X3DIR)/rfact,
-                        (ndim > 1) * block_size.nx(X2DIR)/rfact,
-                        block_size.nx(X1DIR)/rfact, multilevel*Globals::nghost);
-    };
-    return std::make_pair(cb(1), cb(2));
-  }
-
   void ResetLoadBalanceVariables();
 
   void CalculateLoadBalance(std::vector<double> const &cost,

From de3d17d4b4b082dd186dd0cd7f9b3eaeef271bf6 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 16:12:42 -0600
Subject: [PATCH 26/67] actually remove the line that makes the segfault

---
 src/outputs/parthenon_hdf5.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index 66611c52216d..a2607aa1dce0 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -80,8 +80,6 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   const IndexDomain theDomain =
       (output_params.include_ghost_zones ? IndexDomain::entire : IndexDomain::interior);
 
-  auto const &first_block = *(pm->block_list.front());
-
   auto [cellbounds, c_cellbounds] = pm->GetCellBounds();
   const IndexRange out_ib = cellbounds.GetBoundsI(theDomain);
   const IndexRange out_jb = cellbounds.GetBoundsJ(theDomain);

From f1fc1f5525358ed077f8788e9427808d56253f4d Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 7 Sep 2023 16:20:19 -0600
Subject: [PATCH 27/67] keep trying

---
 src/outputs/parthenon_hdf5.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index a2607aa1dce0..d62e51211b8d 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -356,7 +356,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   // get list of all vars, just use the first block since the list is the same for all
   // blocks
   std::vector<VarInfo> all_vars_info;
-  const auto vars = get_vars(pm->block_list.front());
+  const auto vars = (pm->block_list.size() > 0) 
+                    ? get_vars(pm->block_list.front())
+                    : VariableVector<Real>();
   for (auto &v : vars) {
     all_vars_info.emplace_back(v);
   }

From 75ecb00e45bf67c79acc317fdcb5e7e30a8a61ec Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 8 Sep 2023 08:36:19 -0600
Subject: [PATCH 28/67] fix comparison of max block cost to max rank cost

---
 src/mesh/amr_loadbalance.cpp | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 18997ea327db..88d7d91aad6e 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -417,17 +417,25 @@ void AssignBlocks(const std::vector<int> &start, const std::vector<int> &nb, std
   }
 }
 
-std::tuple<double, double> BlockCostInfo(std::vector<double> const &cost) {
+std::tuple<double, double, double> BlockCostInfo(std::vector<double> const &cost,
+                                                 std::vector<int> const &start,
+                                                 std::vector<int> const &nb) {
   const int nblocks = cost.size();
   const int max_rank = std::min(Globals::nranks, nblocks);
   double avg_cost = 0.0;
   double max_block_cost = 0.0;
-  for (int b = 0; b < nblocks; b++) {
-    avg_cost += cost[b];
-    max_block_cost = std::max(max_block_cost, cost[b]);
+  double max_rank_cost = 0.0;
+  for (int i = 0; i < max_rank; i++) {
+    double rank_cost = 0.0;
+    for (int b = start[i]; b < start[i] + nb[i]; b++) {
+      rank_cost += cost[b];
+      avg_cost += cost[b];
+      max_block_cost = std::max(max_block_cost, cost[b]);
+    }
+    max_rank_cost = std::max(max_rank_cost, rank_cost);
   }
   avg_cost /= max_rank;
-  return std::make_tuple(avg_cost, max_block_cost);
+  return std::make_tuple(avg_cost, max_block_cost, max_rank_cost);
 }
 
 void SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<int> &nb) {
@@ -451,7 +459,7 @@ void Mesh::CalculateLoadBalance(std::vector<double> const &cost,
                                 std::vector<int> &nb) {
   Kokkos::Profiling::pushRegion("CalculateLoadBalance");
   if ((lb_automatic_ || lb_manual_)) {
-    auto [avg_cost, max_block_cost] = BlockCostInfo(cost);
+    auto [avg_cost, max_block_cost, max_rank_cost] = BlockCostInfo(cost, start, nb);
     double new_max = CalculateNewBalance(cost, start, nb, avg_cost, max_block_cost);
 
   } else {
@@ -670,13 +678,13 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     // and only move things around if needed
     if (lb_automatic_ || lb_manual_) {
       Kokkos::Profiling::pushRegion("Reloadbalance");
-      auto [avg_cost, max_block_cost] = BlockCostInfo(costlist);
+      auto [avg_cost, max_block_cost, max_rank_cost] = BlockCostInfo(costlist, nslist, nblist);
       std::vector<int> start_trial(Globals::nranks);
       std::vector<int> nb_trial(Globals::nranks);
       double new_max = CalculateNewBalance(costlist, start_trial, nb_trial, avg_cost, max_block_cost);
       Kokkos::Profiling::popRegion();
       // if the improvement isn't large enough, just return because we're done
-      if ((max_block_cost - new_max)/max_block_cost < lb_tolerance_) return false;
+      if ((max_rank_cost - new_max)/max_block_cost < lb_tolerance_) return false;
       newrank.resize(ntot);
       AssignBlocks(start_trial, nb_trial, newrank);
       nslist = std::move(start_trial);

From 1ba414c66a002a5e9c010e54ef089c2ba7fdc2b0 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 8 Sep 2023 08:58:42 -0600
Subject: [PATCH 29/67] set initial balance

---
 src/mesh/amr_loadbalance.cpp | 10 +++++++---
 src/mesh/mesh.cpp            |  1 +
 src/mesh/mesh.hpp            |  1 +
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 88d7d91aad6e..19dafe43be36 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -438,7 +438,9 @@ std::tuple<double, double, double> BlockCostInfo(std::vector<double> const &cost
   return std::make_tuple(avg_cost, max_block_cost, max_rank_cost);
 }
 
-void SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<int> &nb) {
+} // namespace
+
+void Mesh::SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<int> &nb) {
   const int max_rank = std::min(nblocks, Globals::nranks);
   int nassign = nblocks/max_rank;
   start[0] = 0;
@@ -450,10 +452,12 @@ void SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<in
     nb[i] = nassign;
     nassigned += nassign;
   }
+  for (int i = max_rank; i < Globals::nranks; i++) {
+    start[i] = 0;
+    nb[i] = 0;
+  }
 }
 
-} // namespace
-
 void Mesh::CalculateLoadBalance(std::vector<double> const &cost,
                                 std::vector<int> &rank, std::vector<int> &start,
                                 std::vector<int> &nb) {
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 43adc9d04632..3b9b58e17f44 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -411,6 +411,7 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
 
   nslist = std::vector<int>(Globals::nranks);
   nblist = std::vector<int>(Globals::nranks);
+  SetSimpleBalance(nbtotal, nslist, nblist);
   if (adaptive) { // allocate arrays for AMR
     nref = std::vector<int>(Globals::nranks);
     nderef = std::vector<int>(Globals::nranks);
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 45e92e1085ca..c5ce1114b744 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -280,6 +280,7 @@ class Mesh {
 
   void ResetLoadBalanceVariables();
 
+  void SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<int> &nb);
   void CalculateLoadBalance(std::vector<double> const &cost,
                             std::vector<int> &rank, std::vector<int> &start,
                             std::vector<int> &nb);

From b01afeb9b20ae7a16c7aa49ee6c00b813012cdec Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 8 Sep 2023 10:18:55 -0600
Subject: [PATCH 30/67] fix some more stuff

---
 src/mesh/amr_loadbalance.cpp | 17 +++++++++--------
 src/mesh/mesh.cpp            |  1 -
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 19dafe43be36..44a96bbdaf6b 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -379,13 +379,13 @@ double CalculateNewBalance(std::vector<double> const &cost, std::vector<int> &st
   // set the bounds for the search
   double a = avg_cost;
   double b = std::max(2.0*avg_cost, 1.01*max_block_cost);
-  Real h = b - a;
-  constexpr Real invphi = 0.618033988749894848204586834366;
-  constexpr Real invphi2 = 0.38196601125010515179541316563436188228;
-  Real c = a + invphi2 * h;
-  Real d = a + invphi * h;
-  Real yc = DistributeTrial(cost, start, nb, max_rank, c);
-  Real yd = DistributeTrial(cost, start, nb, max_rank, d);
+  double h = b - a;
+  constexpr double invphi = 0.618033988749894848204586834366;
+  constexpr double invphi2 = 0.38196601125010515179541316563436188228;
+  double c = a + invphi2 * h;
+  double d = a + invphi * h;
+  double yc = DistributeTrial(cost, start, nb, max_rank, c);
+  double yd = DistributeTrial(cost, start, nb, max_rank, d);
 
   while (yc != yd) {
     if (yc < yd) {
@@ -463,6 +463,7 @@ void Mesh::CalculateLoadBalance(std::vector<double> const &cost,
                                 std::vector<int> &nb) {
   Kokkos::Profiling::pushRegion("CalculateLoadBalance");
   if ((lb_automatic_ || lb_manual_)) {
+    SetSimpleBalance(cost.size(), start, nb);
     auto [avg_cost, max_block_cost, max_rank_cost] = BlockCostInfo(cost, start, nb);
     double new_max = CalculateNewBalance(cost, start, nb, avg_cost, max_block_cost);
 
@@ -688,7 +689,7 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
       double new_max = CalculateNewBalance(costlist, start_trial, nb_trial, avg_cost, max_block_cost);
       Kokkos::Profiling::popRegion();
       // if the improvement isn't large enough, just return because we're done
-      if ((max_rank_cost - new_max)/max_block_cost < lb_tolerance_) return false;
+      if ((max_rank_cost - new_max)/max_rank_cost < lb_tolerance_) return false;
       newrank.resize(ntot);
       AssignBlocks(start_trial, nb_trial, newrank);
       nslist = std::move(start_trial);
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 3b9b58e17f44..43adc9d04632 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -411,7 +411,6 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
 
   nslist = std::vector<int>(Globals::nranks);
   nblist = std::vector<int>(Globals::nranks);
-  SetSimpleBalance(nbtotal, nslist, nblist);
   if (adaptive) { // allocate arrays for AMR
     nref = std::vector<int>(Globals::nranks);
     nderef = std::vector<int>(Globals::nranks);

From 9fe2aff1618991571c989f271dcb7918c1d17c0a Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 8 Sep 2023 10:59:44 -0600
Subject: [PATCH 31/67] missed setting some block_lid fields

---
 src/bvals/comms/bnd_info.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp
index bd1ef608681e..fdb7de92dc9c 100644
--- a/src/bvals/comms/bnd_info.cpp
+++ b/src/bvals/comms/bnd_info.cpp
@@ -222,6 +222,8 @@ ProResInfo ProResInfo::GetInteriorRestrict(std::shared_ptr<MeshBlock> pmb,
                                            std::shared_ptr<Variable<Real>> v) {
   ProResInfo out;
 
+  out.block_lid = pmb->lid;
+
   out.allocated = v->IsAllocated();
   if (!out.allocated) return out;
 
@@ -256,6 +258,8 @@ ProResInfo ProResInfo::GetInteriorProlongate(std::shared_ptr<MeshBlock> pmb,
                                              std::shared_ptr<Variable<Real>> v) {
   ProResInfo out;
 
+  out.block_lid = pmb->lid;
+
   out.allocated = v->IsAllocated();
   if (!out.allocated) return out;
 
@@ -322,12 +326,12 @@ ProResInfo ProResInfo::GetSend(std::shared_ptr<MeshBlock> pmb, const NeighborBlo
 ProResInfo ProResInfo::GetSet(std::shared_ptr<MeshBlock> pmb, const NeighborBlock &nb,
                               std::shared_ptr<Variable<Real>> v) {
   ProResInfo out;
+  out.block_lid = pmb->lid;
   out.allocated = v->IsAllocated();
   int Nv = v->GetDim(4);
   int Nu = v->GetDim(5);
   int Nt = v->GetDim(6);
 
-  out.block_lid = pmb->lid;
   int mylevel = pmb->loc.level();
   out.coords = pmb->coords;
   if (pmb->pmr) out.coarse_coords = pmb->pmr->GetCoarseCoords();
@@ -384,6 +388,7 @@ BndInfo BndInfo::GetSetBndInfo(std::shared_ptr<MeshBlock> pmb, const NeighborBlo
                                std::shared_ptr<Variable<Real>> v,
                                CommBuffer<buf_pool_t<Real>::owner_t> *buf) {
   BndInfo out;
+  out.block_lid = pmb->lid;
   out.buf = buf->buffer();
   auto buf_state = buf->GetState();
   if (buf_state == BufferState::received) {
@@ -399,7 +404,6 @@ BndInfo BndInfo::GetSetBndInfo(std::shared_ptr<MeshBlock> pmb, const NeighborBlo
   int Nu = v->GetDim(5);
   int Nt = v->GetDim(6);
 
-  out.block_lid = pmb->lid;
   int mylevel = pmb->loc.level();
 
   auto elements = v->GetTopologicalElements();

From 4a0a7ddce7d240c18edd1d965d53b307dab1893b Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 8 Sep 2023 14:02:02 -0600
Subject: [PATCH 32/67] playing with sparse packs...

---
 src/interface/sparse_pack_base.cpp | 29 ++++++++++++++++++-----------
 src/interface/sparse_pack_base.hpp |  3 ++-
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index 371a45c141de..4a0c7795bf8f 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -289,17 +289,17 @@ template SparsePackBase SparsePackBase::Build<MeshData<Real>>(MeshData<Real> *,
 template <class T>
 SparsePackBase &SparsePackCache::Get(T *pmd, const PackDescriptor &desc,
                                      const std::vector<bool> &include_block) {
-  std::string ident = GetIdentifier(desc, include_block);
+  std::string ident = GetIdentifier(pmd, desc, include_block);
   if (pack_map.count(ident) > 0) {
-    auto &pack = pack_map[ident].first;
-    auto alloc_status_in = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
-    auto &alloc_status = pack_map[ident].second;
-    if (alloc_status.size() != alloc_status_in.size())
-      return BuildAndAdd(pmd, desc, ident, include_block);
-    for (int i = 0; i < alloc_status_in.size(); ++i) {
-      if (alloc_status[i] != alloc_status_in[i])
-        return BuildAndAdd(pmd, desc, ident, include_block);
-    }
+    //auto &pack = pack_map[ident].first;
+    //auto alloc_status_in = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
+    //auto &alloc_status = pack_map[ident].second;
+    //if (alloc_status.size() != alloc_status_in.size())
+      //return BuildAndAdd(pmd, desc, ident, include_block);
+    //for (int i = 0; i < alloc_status_in.size(); ++i) {
+      //if (alloc_status[i] != alloc_status_in[i])
+        //return BuildAndAdd(pmd, desc, ident, include_block);
+    //}
     // Cached version is not stale, so just return a reference to it
     return pack_map[ident].first;
   }
@@ -329,7 +329,8 @@ template SparsePackBase &SparsePackCache::BuildAndAdd<MeshBlockData<Real>>(
     MeshBlockData<Real> *, const PackDescriptor &, const std::string &,
     const std::vector<bool> &);
 
-std::string SparsePackCache::GetIdentifier(const PackDescriptor &desc,
+template <class T>
+std::string SparsePackCache::GetIdentifier(T *pmd, const PackDescriptor &desc,
                                            const std::vector<bool> &include_block) const {
   std::string identifier("");
   for (const auto &vgroup : desc.var_groups) {
@@ -341,9 +342,15 @@ std::string SparsePackCache::GetIdentifier(const PackDescriptor &desc,
   identifier += std::to_string(desc.with_fluxes);
   identifier += std::to_string(desc.coarse);
   identifier += std::to_string(desc.flat);
+  identifier += "|";
   for (const auto b : include_block) {
     identifier += std::to_string(b);
   }
+  identifier += "|";
+  auto alloc_status = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
+  for (const auto a : alloc_status) {
+    identifier += std::to_string(a);
+  }
   return identifier;
 }
 
diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index e021a49db2f3..4a49135468a5 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -124,7 +124,8 @@ class SparsePackCache {
                               const std::string &ident,
                               const std::vector<bool> &include_block);
 
-  std::string GetIdentifier(const impl::PackDescriptor &desc,
+  template <class T>
+  std::string GetIdentifier(T *pmd, const impl::PackDescriptor &desc,
                             const std::vector<bool> &include_block) const;
   std::unordered_map<std::string, std::pair<SparsePackBase, SparsePackBase::alloc_t>>
       pack_map;

From 036eaea2290c936cb3ddc99b1590c8a591beaad1 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 8 Sep 2023 14:46:50 -0600
Subject: [PATCH 33/67] revert that change

---
 src/interface/sparse_pack_base.cpp | 28 +++++++++++-----------------
 src/interface/sparse_pack_base.hpp |  3 +--
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index 4a0c7795bf8f..a7ac4e3f5037 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -289,17 +289,17 @@ template SparsePackBase SparsePackBase::Build<MeshData<Real>>(MeshData<Real> *,
 template <class T>
 SparsePackBase &SparsePackCache::Get(T *pmd, const PackDescriptor &desc,
                                      const std::vector<bool> &include_block) {
-  std::string ident = GetIdentifier(pmd, desc, include_block);
+  std::string ident = GetIdentifier(desc, include_block);
   if (pack_map.count(ident) > 0) {
-    //auto &pack = pack_map[ident].first;
-    //auto alloc_status_in = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
-    //auto &alloc_status = pack_map[ident].second;
-    //if (alloc_status.size() != alloc_status_in.size())
-      //return BuildAndAdd(pmd, desc, ident, include_block);
-    //for (int i = 0; i < alloc_status_in.size(); ++i) {
-      //if (alloc_status[i] != alloc_status_in[i])
-        //return BuildAndAdd(pmd, desc, ident, include_block);
-    //}
+    auto &pack = pack_map[ident].first;
+    auto alloc_status_in = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
+    auto &alloc_status = pack_map[ident].second;
+    if (alloc_status.size() != alloc_status_in.size())
+      return BuildAndAdd(pmd, desc, ident, include_block);
+    for (int i = 0; i < alloc_status_in.size(); ++i) {
+      if (alloc_status[i] != alloc_status_in[i])
+        return BuildAndAdd(pmd, desc, ident, include_block);
+    }
     // Cached version is not stale, so just return a reference to it
     return pack_map[ident].first;
   }
@@ -329,8 +329,7 @@ template SparsePackBase &SparsePackCache::BuildAndAdd<MeshBlockData<Real>>(
     MeshBlockData<Real> *, const PackDescriptor &, const std::string &,
     const std::vector<bool> &);
 
-template <class T>
-std::string SparsePackCache::GetIdentifier(T *pmd, const PackDescriptor &desc,
+std::string SparsePackCache::GetIdentifier(const PackDescriptor &desc,
                                            const std::vector<bool> &include_block) const {
   std::string identifier("");
   for (const auto &vgroup : desc.var_groups) {
@@ -346,11 +345,6 @@ std::string SparsePackCache::GetIdentifier(T *pmd, const PackDescriptor &desc,
   for (const auto b : include_block) {
     identifier += std::to_string(b);
   }
-  identifier += "|";
-  auto alloc_status = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
-  for (const auto a : alloc_status) {
-    identifier += std::to_string(a);
-  }
   return identifier;
 }
 
diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index 4a49135468a5..e021a49db2f3 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -124,8 +124,7 @@ class SparsePackCache {
                               const std::string &ident,
                               const std::vector<bool> &include_block);
 
-  template <class T>
-  std::string GetIdentifier(T *pmd, const impl::PackDescriptor &desc,
+  std::string GetIdentifier(const impl::PackDescriptor &desc,
                             const std::vector<bool> &include_block) const;
   std::unordered_map<std::string, std::pair<SparsePackBase, SparsePackBase::alloc_t>>
       pack_map;

From bb31132788daa8e6860964e6c484c0483bab997c Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 8 Sep 2023 16:00:12 -0600
Subject: [PATCH 34/67] play more with sparse pack stuff

---
 src/interface/sparse_pack_base.cpp | 29 +++++++++++++----------------
 src/interface/sparse_pack_base.hpp | 22 +++++++++++++++++++---
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index a7ac4e3f5037..5711f6662b58 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -289,21 +289,21 @@ template SparsePackBase SparsePackBase::Build<MeshData<Real>>(MeshData<Real> *,
 template <class T>
 SparsePackBase &SparsePackCache::Get(T *pmd, const PackDescriptor &desc,
                                      const std::vector<bool> &include_block) {
-  std::string ident = GetIdentifier(desc, include_block);
-  if (pack_map.count(ident) > 0) {
-    auto &pack = pack_map[ident].first;
+  //std::string ident = GetIdentifier(desc, include_block);
+  if (auto pack_pair = pack_map.find(desc.identifier); pack_pair != pack_map.end()) {
+    auto &pack = pack_pair->second;
     auto alloc_status_in = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
-    auto &alloc_status = pack_map[ident].second;
+    auto &alloc_status = pack.second;
     if (alloc_status.size() != alloc_status_in.size())
-      return BuildAndAdd(pmd, desc, ident, include_block);
+      return BuildAndAdd(pmd, desc, include_block);
     for (int i = 0; i < alloc_status_in.size(); ++i) {
       if (alloc_status[i] != alloc_status_in[i])
-        return BuildAndAdd(pmd, desc, ident, include_block);
+        return BuildAndAdd(pmd, desc, include_block);
     }
     // Cached version is not stale, so just return a reference to it
-    return pack_map[ident].first;
+    return pack.first;
   }
-  return BuildAndAdd(pmd, desc, ident, include_block);
+  return BuildAndAdd(pmd, desc, include_block);
 }
 template SparsePackBase &SparsePackCache::Get<MeshData<Real>>(MeshData<Real> *,
                                                               const PackDescriptor &,
@@ -314,20 +314,17 @@ SparsePackCache::Get<MeshBlockData<Real>>(MeshBlockData<Real> *, const PackDescr
 
 template <class T>
 SparsePackBase &SparsePackCache::BuildAndAdd(T *pmd, const PackDescriptor &desc,
-                                             const std::string &ident,
                                              const std::vector<bool> &include_block) {
-  if (pack_map.count(ident) > 0) pack_map.erase(ident);
-  pack_map[ident] = {SparsePackBase::Build(pmd, desc, include_block),
-                     SparsePackBase::GetAllocStatus(pmd, desc, include_block)};
-  return pack_map[ident].first;
+  if (pack_map.count(desc.identifier) > 0) pack_map.erase(desc.identifier);
+  pack_map[desc.identifier] = {SparsePackBase::Build(pmd, desc, include_block),
+                               SparsePackBase::GetAllocStatus(pmd, desc, include_block)};
+  return pack_map[desc.identifier].first;
 }
 template SparsePackBase &
 SparsePackCache::BuildAndAdd<MeshData<Real>>(MeshData<Real> *, const PackDescriptor &,
-                                             const std::string &,
                                              const std::vector<bool> &);
 template SparsePackBase &SparsePackCache::BuildAndAdd<MeshBlockData<Real>>(
-    MeshBlockData<Real> *, const PackDescriptor &, const std::string &,
-    const std::vector<bool> &);
+    MeshBlockData<Real> *, const PackDescriptor &, const std::vector<bool> &);
 
 std::string SparsePackCache::GetIdentifier(const PackDescriptor &desc,
                                            const std::vector<bool> &include_block) const {
diff --git a/src/interface/sparse_pack_base.hpp b/src/interface/sparse_pack_base.hpp
index e021a49db2f3..af13b22b108c 100644
--- a/src/interface/sparse_pack_base.hpp
+++ b/src/interface/sparse_pack_base.hpp
@@ -121,7 +121,6 @@ class SparsePackCache {
 
   template <class T>
   SparsePackBase &BuildAndAdd(T *pmd, const impl::PackDescriptor &desc,
-                              const std::string &ident,
                               const std::vector<bool> &include_block);
 
   std::string GetIdentifier(const impl::PackDescriptor &desc,
@@ -143,7 +142,7 @@ struct PackDescriptor {
   // default constructor needed for certain use cases
   PackDescriptor()
       : nvar_groups(0), var_group_names({}), var_groups({}), with_fluxes(false),
-        coarse(false), flat(false) {}
+        coarse(false), flat(false), identifier("") {}
 
   template <class GROUP_t, class SELECTOR_t>
   PackDescriptor(StateDescriptor *psd, const std::vector<GROUP_t> &var_groups_in,
@@ -151,7 +150,8 @@ struct PackDescriptor {
       : nvar_groups(var_groups_in.size()), var_group_names(MakeGroupNames(var_groups_in)),
         var_groups(BuildUids(var_groups_in.size(), psd, selector)),
         with_fluxes(options.count(PDOpt::WithFluxes)),
-        coarse(options.count(PDOpt::Coarse)), flat(options.count(PDOpt::Flatten)) {
+        coarse(options.count(PDOpt::Coarse)), flat(options.count(PDOpt::Flatten)),
+        identifier(GetIdentifier()) {
     PARTHENON_REQUIRE(!(with_fluxes && coarse),
                       "Probably shouldn't be making a coarse pack with fine fluxes.");
   }
@@ -162,8 +162,24 @@ struct PackDescriptor {
   const bool with_fluxes;
   const bool coarse;
   const bool flat;
+  const std::string identifier;
+
 
  private:
+  std::string GetIdentifier() {
+    std::string ident("");
+    for (const auto &vgroup : var_groups) {
+      for (const auto &[vid, uid] : vgroup) {
+        ident += std::to_string(uid) + "_";
+      }
+      ident += "|";
+    }
+    ident += std::to_string(with_fluxes);
+    ident += std::to_string(coarse);
+    ident += std::to_string(flat);
+    return ident;
+  }
+
   template <class FUNC_t>
   std::vector<PackDescriptor::VariableGroup_t>
   BuildUids(int nvgs, const StateDescriptor *const psd, const FUNC_t &selector) {

From 50c7acfffb0131fe44f5b63c0b77b517a0915c98 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 28 Sep 2023 10:36:14 -0600
Subject: [PATCH 35/67] add BlockTimer to SparseDealloc

---
 src/interface/update.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/interface/update.cpp b/src/interface/update.cpp
index e4da9849f8d2..2127d9ec1e8e 100644
--- a/src/interface/update.cpp
+++ b/src/interface/update.cpp
@@ -167,6 +167,7 @@ TaskStatus SparseDealloc(MeshData<Real> *md) {
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), pack.GetNBlocks(), Kokkos::AUTO),
       KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
         const int b = team_member.league_rank();
+        BlockTimer(team_member, pack, b);
 
         const int lo = pack.GetLowerBound(b);
         const int hi = pack.GetUpperBound(b);

From f70f2f15c4e551b45a29c195ddb700092fcd950e Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Tue, 24 Oct 2023 19:22:17 -0600
Subject: [PATCH 36/67] typo and missing include

---
 src/interface/update.cpp     | 1 +
 src/mesh/amr_loadbalance.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/interface/update.cpp b/src/interface/update.cpp
index 2127d9ec1e8e..108a847c11d2 100644
--- a/src/interface/update.cpp
+++ b/src/interface/update.cpp
@@ -25,6 +25,7 @@
 #include "interface/variable_pack.hpp"
 #include "mesh/mesh.hpp"
 #include "mesh/meshblock.hpp"
+#include "utils/block_timer.hpp"
 
 #include "kokkos_abstraction.hpp"
 #include "mesh/meshblock_pack.hpp"
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 4db6ed6b2b53..4d0eb0f45cd9 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -990,7 +990,7 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   // Internal refinement relies on the fine shared values, which are only consistent after
   // being updated with any previously fine versions
   refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache,
-                                 cellbounds, fcellc_cellbounds);
+                                 cellbounds, c_cellbounds);
 
   // Rebuild just the ownership model, this time weighting the "new" fine blocks just like
   // any other blocks at their level.

From 2a4cf88174ef2bc8ec4b9d76c6e6d1d76a9c25d6 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Mon, 30 Oct 2023 12:00:29 -0600
Subject: [PATCH 37/67] fix broken merge

---
 src/mesh/amr_loadbalance.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 4d0eb0f45cd9..e20da0702c5e 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -964,6 +964,11 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   loclist = std::move(newloc);
   ranklist = std::move(newrank);
   costlist = std::move(newcost);
+#ifdef ENABLE_LB_TIMERS
+  block_cost.Realloc(nbe - nbs + 1);
+#else
+  block_cost.resize(nbe - nbs + 1);
+#endif
 
   // A block newly refined and prolongated may have neighbors which were
   // already refined to the new level.
@@ -1000,12 +1005,6 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   
   Kokkos::Profiling::popRegion(); // AMR: Recv data and unpack
 
-#ifdef ENABLE_LB_TIMERS
-  block_cost.Realloc(nbe - nbs + 1);
-#else
-  block_cost.resize(nbe - nbs + 1);
-#endif
-
   ResetLoadBalanceVariables();
 
   Kokkos::Profiling::popRegion(); // RedistributeAndRefineMeshBlocks

From 5578924fdfd154f350a4dba0c76fbde781f1ac4b Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Mon, 30 Oct 2023 13:46:51 -0600
Subject: [PATCH 38/67] try again

---
 src/mesh/amr_loadbalance.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index e20da0702c5e..88d3f47edcc3 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -767,6 +767,12 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   int nbs = nslist[Globals::my_rank];
   int nbe = nbs + nblist[Globals::my_rank] - 1;
 
+#ifdef ENABLE_LB_TIMERS
+  block_cost.Realloc(nbe - nbs + 1);
+#else
+  block_cost.resize(nbe - nbs + 1);
+#endif
+
   // Restrict fine to coarse buffers
   ProResCache_t restriction_cache;
   int nrestrict = 0;
@@ -964,11 +970,6 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   loclist = std::move(newloc);
   ranklist = std::move(newrank);
   costlist = std::move(newcost);
-#ifdef ENABLE_LB_TIMERS
-  block_cost.Realloc(nbe - nbs + 1);
-#else
-  block_cost.resize(nbe - nbs + 1);
-#endif
 
   // A block newly refined and prolongated may have neighbors which were
   // already refined to the new level.

From 863ec24290ba52a390709634ad9e335b7d555605 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Tue, 31 Oct 2023 10:59:25 -0600
Subject: [PATCH 39/67] wrap profiling calls in a macro/struct

---
 benchmarks/burgers/burgers_package.cpp        |   6 +-
 example/advection/advection_package.cpp       |   3 +-
 example/particles/particles.cpp               |  15 +-
 example/poisson/poisson_package.cpp           |   6 +-
 .../sparse_advection_package.cpp              |   3 +-
 .../stochastic_subgrid_package.cpp            |   8 +-
 src/amr_criteria/refinement_package.cpp       |   6 +-
 src/bvals/boundary_conditions.cpp             |   3 +-
 src/bvals/bvals_base.cpp                      |   5 +-
 src/bvals/comms/boundary_communication.cpp    |  17 +-
 src/bvals/comms/build_boundary_buffers.cpp    |   3 +-
 src/bvals/comms/flux_correction.cpp           |  15 +-
 src/driver/driver.cpp                         |  79 +-
 src/driver/multistage.hpp                     |   6 +-
 src/interface/meshblock_data.cpp              |   3 +-
 src/interface/swarm_container.cpp             |  26 +-
 src/interface/update.cpp                      |   4 +-
 src/interface/update.hpp                      |  60 +-
 src/kokkos_abstraction.hpp                    |  13 +-
 src/mesh/amr_loadbalance.cpp                  | 420 +++++-----
 src/mesh/mesh.cpp                             |   4 +-
 src/outputs/outputs.cpp                       |   3 +-
 src/outputs/parthenon_hdf5.cpp                | 740 +++++++++---------
 src/parthenon/prelude.hpp                     |   1 +
 24 files changed, 692 insertions(+), 757 deletions(-)

diff --git a/benchmarks/burgers/burgers_package.cpp b/benchmarks/burgers/burgers_package.cpp
index 64201440b80b..b66bc69c2430 100644
--- a/benchmarks/burgers/burgers_package.cpp
+++ b/benchmarks/burgers/burgers_package.cpp
@@ -127,7 +127,7 @@ void CalculateDerived(MeshData<Real> *md) {
 
 // provide the routine that estimates a stable timestep for this package
 Real EstimateTimestepMesh(MeshData<Real> *md) {
-  Kokkos::Profiling::pushRegion("Task_burgers_EstimateTimestepMesh");
+  PARTHENON_INSTRUMENT
   auto pm = md->GetParentPointer();
   IndexRange ib = md->GetBoundsI(IndexDomain::interior);
   IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
@@ -155,14 +155,13 @@ Real EstimateTimestepMesh(MeshData<Real> *md) {
       },
       Kokkos::Min<Real>(min_dt));
 
-  Kokkos::Profiling::popRegion(); // Task_burgers_EstimateTimestepMesh
   return cfl * min_dt;
 }
 
 TaskStatus CalculateFluxes(MeshData<Real> *md) {
   using parthenon::ScratchPad1D;
   using parthenon::team_mbr_t;
-  Kokkos::Profiling::pushRegion("Task_burgers_CalculateFluxes");
+  PARTHENON_INSTRUMENT
 
   auto pm = md->GetParentPointer();
   const int ndim = pm->ndim;
@@ -360,7 +359,6 @@ TaskStatus CalculateFluxes(MeshData<Real> *md) {
         }
       });
 
-  Kokkos::Profiling::popRegion(); // Task_burgers_CalculateFluxes
   return TaskStatus::complete;
 }
 
diff --git a/example/advection/advection_package.cpp b/example/advection/advection_package.cpp
index f618de1758c3..52f8d2c69bda 100644
--- a/example/advection/advection_package.cpp
+++ b/example/advection/advection_package.cpp
@@ -438,7 +438,7 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
 TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   using parthenon::MetadataFlag;
 
-  Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes");
+  PARTHENON_INSTRUMENT
   auto pmb = rc->GetBlockPointer();
 
   IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
@@ -581,7 +581,6 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
         });
   }
 
-  Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes
   return TaskStatus::complete;
 }
 
diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp
index 5e5648518d3e..5a112977cc23 100644
--- a/example/particles/particles.cpp
+++ b/example/particles/particles.cpp
@@ -130,7 +130,7 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
 // first some helper tasks
 
 TaskStatus DestroySomeParticles(MeshBlock *pmb) {
-  Kokkos::Profiling::pushRegion("Task_Particles_DestroySomeParticles");
+  PARTHENON_INSTRUMENT
 
   auto pkg = pmb->packages.Get("particles_package");
   auto swarm = pmb->swarm_data.Get()->Get("my_particles");
@@ -156,7 +156,6 @@ TaskStatus DestroySomeParticles(MeshBlock *pmb) {
   // Remove marked particles
   swarm->RemoveMarkedParticles();
 
-  Kokkos::Profiling::popRegion(); // Task_Particles_DestroySomeParticles
   return TaskStatus::complete;
 }
 
@@ -172,7 +171,7 @@ TaskStatus SortParticlesIfUsingPerCellDeposition(MeshBlock *pmb) {
 }
 
 TaskStatus DepositParticles(MeshBlock *pmb) {
-  Kokkos::Profiling::pushRegion("Task_Particles_DepositParticles");
+  PARTHENON_INSTRUMENT
 
   auto swarm = pmb->swarm_data.Get()->Get("my_particles");
 
@@ -238,12 +237,11 @@ TaskStatus DepositParticles(MeshBlock *pmb) {
         });
   }
 
-  Kokkos::Profiling::popRegion(); // Task_Particles_DepositParticles
   return TaskStatus::complete;
 }
 
 TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
-  Kokkos::Profiling::pushRegion("Task_Particles_CreateSomeParticles");
+  PARTHENON_INSTRUMENT
 
   auto pkg = pmb->packages.Get("particles_package");
   auto swarm = pmb->swarm_data.Get()->Get("my_particles");
@@ -351,13 +349,12 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
         });
   }
 
-  Kokkos::Profiling::popRegion(); // Task_Particles_CreateSomeParticles
   return TaskStatus::complete;
 }
 
 TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator,
                               const double t0) {
-  Kokkos::Profiling::pushRegion("Task_Particles_TransportParticles");
+  PARTHENON_INSTRUMENT
 
   auto swarm = pmb->swarm_data.Get()->Get("my_particles");
   auto pkg = pmb->packages.Get("particles_package");
@@ -478,7 +475,6 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator
         });
   }
 
-  Kokkos::Profiling::popRegion(); // Task_Particles_TransportParticles
   return TaskStatus::complete;
 }
 
@@ -520,7 +516,7 @@ TaskListStatus ParticleDriver::Step() {
 // TODO(BRR) This should really be in parthenon/src... but it can't just live in Swarm
 // because of the loop over blocks
 TaskStatus StopCommunicationMesh(const BlockList_t &blocks) {
-  Kokkos::Profiling::pushRegion("Task_Particles_StopCommunicationMesh");
+  PARTHENON_INSTRUMENT
 
   int num_sent_local = 0;
   for (auto &block : blocks) {
@@ -574,7 +570,6 @@ TaskStatus StopCommunicationMesh(const BlockList_t &blocks) {
     }
   }
 
-  Kokkos::Profiling::popRegion(); // Task_Particles_StopCommunicationMesh
   return TaskStatus::complete;
 }
 
diff --git a/example/poisson/poisson_package.cpp b/example/poisson/poisson_package.cpp
index 24d60613981f..378bc48a3176 100644
--- a/example/poisson/poisson_package.cpp
+++ b/example/poisson/poisson_package.cpp
@@ -207,7 +207,7 @@ TaskStatus SumDeltaPhi(T *du, Real *reduce_sum) {
 template <typename T>
 TaskStatus UpdatePhi(T *u, T *du) {
   using Stencil_t = parthenon::solvers::Stencil<Real>;
-  Kokkos::Profiling::pushRegion("Task_Poisson_UpdatePhi");
+  PARTHENON_INSTRUMENT
   auto pm = u->GetParentPointer();
 
   IndexRange ib = u->GetBoundsI(IndexDomain::interior);
@@ -274,13 +274,12 @@ TaskStatus UpdatePhi(T *u, T *du) {
         v(b, iphi, k, j, i) += dv(b, idphi, k, j, i);
       });
 
-  Kokkos::Profiling::popRegion(); // Task_Poisson_UpdatePhi
   return TaskStatus::complete;
 }
 
 template <typename T>
 TaskStatus CheckConvergence(T *u, T *du) {
-  Kokkos::Profiling::pushRegion("Task_Poisson_UpdatePhi");
+  PARTHENON_INSTRUMENT
   auto pm = u->GetParentPointer();
 
   IndexRange ib = u->GetBoundsI(IndexDomain::interior);
@@ -311,7 +310,6 @@ TaskStatus CheckConvergence(T *u, T *du) {
 
   auto status = (max_err < err_tol ? TaskStatus::complete : TaskStatus::iterate);
 
-  Kokkos::Profiling::popRegion(); // Task_Poisson_CheckConvergence
   return status;
 }
 
diff --git a/example/sparse_advection/sparse_advection_package.cpp b/example/sparse_advection/sparse_advection_package.cpp
index 01ec9c62475b..b3f8130cbe20 100644
--- a/example/sparse_advection/sparse_advection_package.cpp
+++ b/example/sparse_advection/sparse_advection_package.cpp
@@ -182,7 +182,7 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
 TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   using parthenon::MetadataFlag;
 
-  Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes");
+  PARTHENON_INSTRUMENT
   auto pmb = rc->GetBlockPointer();
 
   IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
@@ -257,7 +257,6 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   PARTHENON_REQUIRE_THROWS(pmb->pmy_mesh->ndim == 2,
                            "Sparse Advection example must be 2D");
 
-  Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes
   return TaskStatus::complete;
 }
 
diff --git a/example/stochastic_subgrid/stochastic_subgrid_package.cpp b/example/stochastic_subgrid/stochastic_subgrid_package.cpp
index 6ab1aad2501c..79be429a3232 100644
--- a/example/stochastic_subgrid/stochastic_subgrid_package.cpp
+++ b/example/stochastic_subgrid/stochastic_subgrid_package.cpp
@@ -245,11 +245,9 @@ AmrTag CheckRefinement(MeshBlockData<Real> *rc) {
 // randomly sample an interation number for each cell from the discrete power-law
 // distribution
 TaskStatus ComputeNumIter(std::shared_ptr<MeshData<Real>> &md, Packages_t &packages) {
-  Kokkos::Profiling::pushRegion("Task_ComputeNumIter");
+  PARTHENON_INSTRUMENT
 
-  Kokkos::Profiling::pushRegion("Task_ComputeNumIter_pack");
   auto pack = md->PackVariables(std::vector<std::string>({"num_iter"}));
-  Kokkos::Profiling::popRegion();
 
   auto pkg = packages.Get("stochastic_subgrid_package");
   const auto &pool =
@@ -276,7 +274,6 @@ TaskStatus ComputeNumIter(std::shared_ptr<MeshData<Real>> &md, Packages_t &packa
         pack(b, v, k, j, i) = num_iter;
       });
 
-  Kokkos::Profiling::popRegion(); // Task_ComputeNumIter
   return TaskStatus::complete;
 }
 
@@ -364,7 +361,7 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
 // some field "advected" that we are pushing around.
 // This routine implements all the "physics" in this example
 TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
-  Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes");
+  PARTHENON_INSTRUMENT
   auto pmb = rc->GetBlockPointer();
   const IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
   const IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
@@ -467,7 +464,6 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
         });
   }
 
-  Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes
   return TaskStatus::complete;
 }
 
diff --git a/src/amr_criteria/refinement_package.cpp b/src/amr_criteria/refinement_package.cpp
index 285d01c71c21..bc6744cab109 100644
--- a/src/amr_criteria/refinement_package.cpp
+++ b/src/amr_criteria/refinement_package.cpp
@@ -153,19 +153,17 @@ void SetRefinement_(MeshBlockData<Real> *rc) {
 
 template <>
 TaskStatus Tag(MeshBlockData<Real> *rc) {
-  Kokkos::Profiling::pushRegion("Task_Tag_Block");
+  PARTHENON_INSTRUMENT
   SetRefinement_(rc);
-  Kokkos::Profiling::popRegion(); // Task_Tag_Block
   return TaskStatus::complete;
 }
 
 template <>
 TaskStatus Tag(MeshData<Real> *rc) {
-  Kokkos::Profiling::pushRegion("Task_Tag_Mesh");
+  PARTHENON_INSTRUMENT
   for (int i = 0; i < rc->NumBlocks(); i++) {
     SetRefinement_(rc->GetBlockData(i).get());
   }
-  Kokkos::Profiling::popRegion(); // Task_Tag_Mesh
   return TaskStatus::complete;
 }
 
diff --git a/src/bvals/boundary_conditions.cpp b/src/bvals/boundary_conditions.cpp
index bac842d8614b..f453f256bedd 100644
--- a/src/bvals/boundary_conditions.cpp
+++ b/src/bvals/boundary_conditions.cpp
@@ -33,7 +33,7 @@ bool DoPhysicalBoundary_(const BoundaryFlag flag, const BoundaryFace face,
 
 TaskStatus ApplyBoundaryConditionsOnCoarseOrFine(std::shared_ptr<MeshBlockData<Real>> &rc,
                                                  bool coarse) {
-  Kokkos::Profiling::pushRegion("Task_ApplyBoundaryConditionsOnCoarseOrFine");
+  PARTHENON_INSTRUMENT
   using namespace boundary_cond_impl;
   std::shared_ptr<MeshBlock> pmb = rc->GetBlockPointer();
   Mesh *pmesh = pmb->pmy_mesh;
@@ -47,7 +47,6 @@ TaskStatus ApplyBoundaryConditionsOnCoarseOrFine(std::shared_ptr<MeshBlockData<R
     }
   }
 
-  Kokkos::Profiling::popRegion(); // Task_ApplyBoundaryConditionsOnCoarseOrFine
   return TaskStatus::complete;
 }
 
diff --git a/src/bvals/bvals_base.cpp b/src/bvals/bvals_base.cpp
index 057493f186e8..45147b68011e 100644
--- a/src/bvals/bvals_base.cpp
+++ b/src/bvals/bvals_base.cpp
@@ -304,7 +304,7 @@ int BoundaryBase::CreateBvalsMPITag(int lid, int bufid) {
 void BoundaryBase::SearchAndSetNeighbors(
     MeshBlockTree &tree, int *ranklist, int *nslist,
     const std::unordered_set<LogicalLocation> &newly_refined) {
-  Kokkos::Profiling::pushRegion("SearchAndSetNeighbors");
+  PARTHENON_INSTRUMENT
   MeshBlockTree *neibt;
   int myox1, myox2 = 0, myox3 = 0, myfx1, myfx2, myfx3;
   myfx1 = ((loc.lx1() & 1LL) == 1LL);
@@ -371,7 +371,6 @@ void BoundaryBase::SearchAndSetNeighbors(
   }
   if (block_size_.nx(X2DIR) == 1) {
     SetNeighborOwnership(newly_refined);
-    Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors
     return;
   }
 
@@ -506,7 +505,6 @@ void BoundaryBase::SearchAndSetNeighbors(
 
   if (block_size_.nx(X3DIR) == 1) {
     SetNeighborOwnership(newly_refined);
-    Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors
     return;
   }
 
@@ -629,7 +627,6 @@ void BoundaryBase::SearchAndSetNeighbors(
   }
 
   SetNeighborOwnership(newly_refined);
-  Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors
 }
 
 void BoundaryBase::SetNeighborOwnership(
diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp
index fc5231c2da79..4f26a9f52809 100644
--- a/src/bvals/comms/boundary_communication.cpp
+++ b/src/bvals/comms/boundary_communication.cpp
@@ -46,7 +46,7 @@ using namespace loops::shorthands;
 
 template <BoundaryType bound_type>
 TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_LoadAndSendBoundBufs");
+  PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true);
@@ -59,11 +59,9 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
       CheckSendBufferCacheForRebuild<bound_type, true>(md);
 
   if (nbound == 0) {
-    Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs
     return TaskStatus::complete;
   }
   if (other_communication_unfinished) {
-    Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs
     return TaskStatus::incomplete;
   }
 
@@ -144,7 +142,6 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
       buf.SendNull();
   }
 
-  Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs
   return TaskStatus::complete;
 }
 
@@ -155,7 +152,7 @@ SendBoundBufs<BoundaryType::nonlocal>(std::shared_ptr<MeshData<Real>> &);
 
 template <BoundaryType bound_type>
 TaskStatus StartReceiveBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_StartReceiveBoundBufs");
+  PARTHENON_INSTRUMENT
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
   if (cache.buf_vec.size() == 0)
@@ -165,7 +162,6 @@ TaskStatus StartReceiveBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
   std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec),
                 [](auto pbuf) { pbuf->TryStartReceive(); });
 
-  Kokkos::Profiling::popRegion(); // Task_StartReceiveBoundBufs
   return TaskStatus::complete;
 }
 
@@ -178,7 +174,7 @@ StartReceiveBoundBufs<BoundaryType::nonlocal>(std::shared_ptr<MeshData<Real>> &)
 
 template <BoundaryType bound_type>
 TaskStatus ReceiveBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_ReceiveBoundBufs");
+  PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
@@ -209,7 +205,6 @@ TaskStatus ReceiveBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
           ++ibound;
         });
   }
-  Kokkos::Profiling::popRegion(); // Task_ReceiveBoundBufs
   if (all_received) return TaskStatus::complete;
   return TaskStatus::incomplete;
 }
@@ -223,7 +218,7 @@ ReceiveBoundBufs<BoundaryType::nonlocal>(std::shared_ptr<MeshData<Real>> &);
 
 template <BoundaryType bound_type>
 TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_SetInternalBoundaries");
+  PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
@@ -294,7 +289,6 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
     refinement::Restrict(resolved_packages, cache.prores_cache, pmb->cellbounds,
                          pmb->c_cellbounds);
   }
-  Kokkos::Profiling::popRegion(); // Task_SetInternalBoundaries
   return TaskStatus::complete;
 }
 
@@ -304,7 +298,7 @@ template TaskStatus SetBounds<BoundaryType::nonlocal>(std::shared_ptr<MeshData<R
 
 template <BoundaryType bound_type>
 TaskStatus ProlongateBounds(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_ProlongateBoundaries");
+  PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
@@ -323,7 +317,6 @@ TaskStatus ProlongateBounds(std::shared_ptr<MeshData<Real>> &md) {
     refinement::ProlongateInternal(resolved_packages, cache.prores_cache, pmb->cellbounds,
                                    pmb->c_cellbounds);
   }
-  Kokkos::Profiling::popRegion(); // Task_ProlongateBoundaries
   return TaskStatus::complete;
 }
 
diff --git a/src/bvals/comms/build_boundary_buffers.cpp b/src/bvals/comms/build_boundary_buffers.cpp
index 01868f22ec9b..3eabe3c4d284 100644
--- a/src/bvals/comms/build_boundary_buffers.cpp
+++ b/src/bvals/comms/build_boundary_buffers.cpp
@@ -113,7 +113,7 @@ void BuildBoundaryBufferSubset(std::shared_ptr<MeshData<Real>> &md,
 // pmesh->boundary_comm_map.clear() after every remesh
 // in InitializeBlockTimeStepsAndBoundaries()
 TaskStatus BuildBoundaryBuffers(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_BuildSendBoundBufs");
+  PARTHENON_INSTRUMENT
   Mesh *pmesh = md->GetMeshPointer();
   auto &all_caches = md->GetBvarsCache();
 
@@ -127,7 +127,6 @@ TaskStatus BuildBoundaryBuffers(std::shared_ptr<MeshData<Real>> &md) {
   BuildBoundaryBufferSubset<BoundaryType::flxcor_recv>(md,
                                                        pmesh->boundary_comm_flxcor_map);
 
-  Kokkos::Profiling::popRegion(); // "Task_BuildSendBoundBufs"
   return TaskStatus::complete;
 }
 } // namespace parthenon
diff --git a/src/bvals/comms/flux_correction.cpp b/src/bvals/comms/flux_correction.cpp
index 5a82a15429e6..50166ec9ca89 100644
--- a/src/bvals/comms/flux_correction.cpp
+++ b/src/bvals/comms/flux_correction.cpp
@@ -39,7 +39,7 @@ namespace parthenon {
 using namespace impl;
 
 TaskStatus LoadAndSendFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_LoadAndSendFluxCorrections");
+  PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_send, true);
@@ -53,12 +53,10 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
       CheckSendBufferCacheForRebuild<BoundaryType::flxcor_send, true>(md);
 
   if (nbound == 0) {
-    Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections
     return TaskStatus::complete;
   }
 
   if (other_communication_unfinished) {
-    Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections
     return TaskStatus::incomplete;
   }
 
@@ -116,12 +114,11 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   // Calling Send will send null if the underlying buffer is unallocated
   for (auto &buf : cache.buf_vec)
     buf->Send();
-  Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections
   return TaskStatus::complete;
 }
 
 TaskStatus StartReceiveFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_StartReceiveFluxCorrections");
+  PARTHENON_INSTRUMENT
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false);
   if (cache.buf_vec.size() == 0)
@@ -131,12 +128,11 @@ TaskStatus StartReceiveFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec),
                 [](auto pbuf) { pbuf->TryStartReceive(); });
 
-  Kokkos::Profiling::popRegion(); // Task_StartReceiveFluxCorrections
   return TaskStatus::complete;
 }
 
 TaskStatus ReceiveFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_ReceiveFluxCorrections");
+  PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false);
@@ -149,14 +145,12 @@ TaskStatus ReceiveFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
       std::begin(cache.buf_vec), std::end(cache.buf_vec),
       [&all_received](auto pbuf) { all_received = pbuf->TryReceive() && all_received; });
 
-  Kokkos::Profiling::popRegion(); // Task_ReceiveFluxCorrections
-
   if (all_received) return TaskStatus::complete;
   return TaskStatus::incomplete;
 }
 
 TaskStatus SetFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
-  Kokkos::Profiling::pushRegion("Task_SetFluxCorrections");
+  PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
   auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false);
@@ -188,7 +182,6 @@ TaskStatus SetFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec),
                 [](auto pbuf) { pbuf->Stale(); });
 
-  Kokkos::Profiling::popRegion(); // Task_SetFluxCorrections
   return TaskStatus::complete;
 }
 
diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp
index 84371eeaeb36..8359e6fc9a82 100644
--- a/src/driver/driver.cpp
+++ b/src/driver/driver.cpp
@@ -75,51 +75,54 @@ DriverStatus EvolutionDriver::Execute() {
   // Defaults must be set across all ranks
   DumpInputParameters();
 
-  Kokkos::Profiling::pushRegion("Driver_Main");
-  while (tm.KeepGoing()) {
-    if (Globals::my_rank == 0) OutputCycleDiagnostics();
-
-    pmesh->PreStepUserWorkInLoop(pmesh, pinput, tm);
-    pmesh->PreStepUserDiagnosticsInLoop(pmesh, pinput, tm);
-
-    TaskListStatus status = Step();
-    if (status != TaskListStatus::complete) {
-      std::cerr << "Step failed to complete all tasks." << std::endl;
-      return DriverStatus::failed;
-    }
+  { // Main t < tmax loop region
+    PARTHENON_INSTRUMENT_REGION("evolution_loop")
+    while (tm.KeepGoing()) {
+      if (Globals::my_rank == 0) OutputCycleDiagnostics();
+
+      pmesh->PreStepUserWorkInLoop(pmesh, pinput, tm);
+      pmesh->PreStepUserDiagnosticsInLoop(pmesh, pinput, tm);
+
+      TaskListStatus status = Step();
+      if (status != TaskListStatus::complete) {
+        std::cerr << "Step failed to complete all tasks." << std::endl;
+        return DriverStatus::failed;
+      }
 
-    pmesh->PostStepUserWorkInLoop(pmesh, pinput, tm);
-    pmesh->PostStepUserDiagnosticsInLoop(pmesh, pinput, tm);
+      pmesh->PostStepUserWorkInLoop(pmesh, pinput, tm);
+      pmesh->PostStepUserDiagnosticsInLoop(pmesh, pinput, tm);
 
-    tm.ncycle++;
-    tm.time += tm.dt;
-    pmesh->mbcnt += pmesh->nbtotal;
-    pmesh->step_since_lb++;
+      tm.ncycle++;
+      tm.time += tm.dt;
+      pmesh->mbcnt += pmesh->nbtotal;
+      pmesh->step_since_lb++;
 
-    timer_LBandAMR.reset();
-    pmesh->LoadBalancingAndAdaptiveMeshRefinement(pinput, app_input);
-    if (pmesh->modified) InitializeBlockTimeStepsAndBoundaries();
-    time_LBandAMR += timer_LBandAMR.seconds();
-    SetGlobalTimeStep();
+      timer_LBandAMR.reset();
+      pmesh->LoadBalancingAndAdaptiveMeshRefinement(pinput, app_input);
+      if (pmesh->modified) InitializeBlockTimeStepsAndBoundaries();
+      time_LBandAMR += timer_LBandAMR.seconds();
+      SetGlobalTimeStep();
 
-    // check for signals
-    signal = SignalHandler::CheckSignalFlags();
+      // check for signals
+      signal = SignalHandler::CheckSignalFlags();
 
-    if (signal == OutputSignal::final) {
-      break;
-    }
+      if (signal == OutputSignal::final) {
+        break;
+      }
 
-    // skip the final (last) output at the end of the simulation time as it happens later
-    if (tm.KeepGoing()) {
-      pouts->MakeOutputs(pmesh, pinput, &tm, signal);
-    }
+      // skip the final (last) output at the end of the simulation time as it happens
+      // later
+      if (tm.KeepGoing()) {
+        pouts->MakeOutputs(pmesh, pinput, &tm, signal);
+      }
 
-    if (tm.ncycle == perf_cycle_offset) {
-      pmesh->mbcnt = 0;
-      timer_main.reset();
-    }
-  } // END OF MAIN INTEGRATION LOOP ======================================================
-  Kokkos::Profiling::popRegion(); // Driver_Main
+      if (tm.ncycle == perf_cycle_offset) {
+        pmesh->mbcnt = 0;
+        timer_main.reset();
+      }
+    } // END OF MAIN INTEGRATION LOOP
+      // ======================================================
+  }   // Main t < tmax loop region
 
   pmesh->UserWorkAfterLoop(pmesh, pinput, tm);
 
diff --git a/src/driver/multistage.hpp b/src/driver/multistage.hpp
index 07748ae3564c..9cc1fcf5b2c5 100644
--- a/src/driver/multistage.hpp
+++ b/src/driver/multistage.hpp
@@ -37,7 +37,7 @@ class MultiStageDriverGeneric : public EvolutionDriver {
   // the dependencies that must be executed.
   virtual TaskCollection MakeTaskCollection(BlockList_t &blocks, int stage) = 0;
   virtual TaskListStatus Step() {
-    Kokkos::Profiling::pushRegion("MultiStage_Step");
+    PARTHENON_INSTRUMENT
     using DriverUtils::ConstructAndExecuteTaskLists;
     TaskListStatus status;
     integrator->dt = tm.dt;
@@ -49,7 +49,6 @@ class MultiStageDriverGeneric : public EvolutionDriver {
       status = ConstructAndExecuteTaskLists<>(this, stage);
       if (status != TaskListStatus::complete) break;
     }
-    Kokkos::Profiling::popRegion(); // MultiStage_Step
     return status;
   }
 
@@ -66,7 +65,7 @@ class MultiStageBlockTaskDriverGeneric : public MultiStageDriverGeneric<Integrat
       : MultiStageDriverGeneric<Integrator>(pin, app_in, pm) {}
   virtual TaskList MakeTaskList(MeshBlock *pmb, int stage) = 0;
   virtual TaskListStatus Step() {
-    Kokkos::Profiling::pushRegion("MultiStageBlockTask_Step");
+    PARTHENON_INSTRUMENT
     using DriverUtils::ConstructAndExecuteBlockTasks;
     TaskListStatus status;
     Integrator *integrator = (this->integrator).get();
@@ -76,7 +75,6 @@ class MultiStageBlockTaskDriverGeneric : public MultiStageDriverGeneric<Integrat
       status = ConstructAndExecuteBlockTasks<>(this, stage);
       if (status != TaskListStatus::complete) break;
     }
-    Kokkos::Profiling::popRegion(); // MultiStageBlockTask_Step
     return status;
   }
 
diff --git a/src/interface/meshblock_data.cpp b/src/interface/meshblock_data.cpp
index 1b194597911b..5f685c0e119d 100644
--- a/src/interface/meshblock_data.cpp
+++ b/src/interface/meshblock_data.cpp
@@ -327,7 +327,7 @@ template <typename T>
 typename MeshBlockData<T>::VarList
 MeshBlockData<T>::GetVariablesByFlag(const Metadata::FlagCollection &flags,
                                      const std::vector<int> &sparse_ids) {
-  Kokkos::Profiling::pushRegion("GetVariablesByFlag");
+  PARTHENON_INSTRUMENT
 
   typename MeshBlockData<T>::VarList var_list;
   std::unordered_set<int> sparse_ids_set(sparse_ids.begin(), sparse_ids.end());
@@ -338,7 +338,6 @@ MeshBlockData<T>::GetVariablesByFlag(const Metadata::FlagCollection &flags,
     var_list.Add(v, sparse_ids_set);
   }
 
-  Kokkos::Profiling::popRegion(); // GetVariablesByFlag
   return var_list;
 }
 
diff --git a/src/interface/swarm_container.cpp b/src/interface/swarm_container.cpp
index 6abb9de69721..056d742bcea7 100644
--- a/src/interface/swarm_container.cpp
+++ b/src/interface/swarm_container.cpp
@@ -83,16 +83,15 @@ void SwarmContainer::Remove(const std::string &label) {
 
 // Return swarms meeting some conditions
 SwarmSet SwarmContainer::GetSwarmsByFlag(const Metadata::FlagCollection &flags) {
-  Kokkos::Profiling::pushRegion("GetSwarmsByFlag");
+  PARTHENON_INSTRUMENT
 
   auto swarms = MetadataUtils::GetByFlag<SwarmSet>(flags, swarmMap_, swarmMetadataMap_);
 
-  Kokkos::Profiling::popRegion(); // GetSwarmsByFlag
   return swarms;
 }
 
 TaskStatus SwarmContainer::Defrag(double min_occupancy) {
-  Kokkos::Profiling::pushRegion("Task_SwarmContainer_Defrag");
+  PARTHENON_INSTRUMENT
   PARTHENON_REQUIRE_THROWS(min_occupancy >= 0. && min_occupancy <= 1.,
                            "Max fractional occupancy of swarm must be >= 0 and <= 1");
 
@@ -103,29 +102,24 @@ TaskStatus SwarmContainer::Defrag(double min_occupancy) {
     }
   }
 
-  Kokkos::Profiling::popRegion();
-
   return TaskStatus::complete;
 }
 
 TaskStatus SwarmContainer::DefragAll() {
-  Kokkos::Profiling::pushRegion("Task_SwarmContainer_Defrag");
+  PARTHENON_INSTRUMENT
   for (auto &s : swarmVector_) {
     s->Defrag();
   }
-  Kokkos::Profiling::popRegion();
   return TaskStatus::complete;
 }
 
 TaskStatus SwarmContainer::SortParticlesByCell() {
-  Kokkos::Profiling::pushRegion("Task_SwarmContainer_SortParticlesByCell");
+  PARTHENON_INSTRUMENT
 
   for (auto &s : swarmVector_) {
     s->SortParticlesByCell();
   }
 
-  Kokkos::Profiling::popRegion();
-
   return TaskStatus::complete;
 }
 
@@ -150,18 +144,17 @@ void SwarmContainer::AllocateBoundaries() {
 }
 
 TaskStatus SwarmContainer::Send(BoundaryCommSubset phase) {
-  Kokkos::Profiling::pushRegion("Task_SwarmContainer_Send");
+  PARTHENON_INSTRUMENT
 
   for (auto &s : swarmVector_) {
     s->Send(phase);
   }
 
-  Kokkos::Profiling::popRegion(); // Task_SwarmContainer_Send
   return TaskStatus::complete;
 }
 
 TaskStatus SwarmContainer::Receive(BoundaryCommSubset phase) {
-  Kokkos::Profiling::pushRegion("Task_SwarmContainer_Receive");
+  PARTHENON_INSTRUMENT
 
   int success = 0, total = 0;
   for (auto &s : swarmVector_) {
@@ -171,24 +164,22 @@ TaskStatus SwarmContainer::Receive(BoundaryCommSubset phase) {
     total++;
   }
 
-  Kokkos::Profiling::popRegion(); // Task_SwarmContainer_Receive
   if (success == total) return TaskStatus::complete;
   return TaskStatus::incomplete;
 }
 
 TaskStatus SwarmContainer::ResetCommunication() {
-  Kokkos::Profiling::pushRegion("Task_SwarmContainer_ResetCommunication");
+  PARTHENON_INSTRUMENT
 
   for (auto &s : swarmVector_) {
     s->ResetCommunication();
   }
 
-  Kokkos::Profiling::popRegion(); // Task_SwarmContainer_ResetCommunication
   return TaskStatus::complete;
 }
 
 TaskStatus SwarmContainer::FinalizeCommunicationIterative() {
-  Kokkos::Profiling::pushRegion("Task_SwarmContainer_FinalizeCommunicationIterative");
+  PARTHENON_INSTRUMENT
 
   PARTHENON_THROW("FinalizeCommunicationIterative not yet fully implemented!")
 
@@ -200,7 +191,6 @@ TaskStatus SwarmContainer::FinalizeCommunicationIterative() {
     total++;
   }
 
-  Kokkos::Profiling::popRegion(); // Task_SwarmContainer_FinalizeCommunicationIterative
   if (success == total) return TaskStatus::complete;
   return TaskStatus::incomplete;
 }
diff --git a/src/interface/update.cpp b/src/interface/update.cpp
index e4da9849f8d2..02c3b632926e 100644
--- a/src/interface/update.cpp
+++ b/src/interface/update.cpp
@@ -140,12 +140,11 @@ TaskStatus UpdateWithFluxDivergence(MeshData<Real> *u0_data, MeshData<Real> *u1_
 }
 
 TaskStatus SparseDealloc(MeshData<Real> *md) {
+  PARTHENON_INSTRUMENT
   if (!Globals::sparse_config.enabled || (md->NumBlocks() == 0)) {
     return TaskStatus::complete;
   }
 
-  Kokkos::Profiling::pushRegion("Task_SparseDealloc");
-
   const IndexRange ib = md->GetBoundsI(IndexDomain::entire);
   const IndexRange jb = md->GetBoundsJ(IndexDomain::entire);
   const IndexRange kb = md->GetBoundsK(IndexDomain::entire);
@@ -219,7 +218,6 @@ TaskStatus SparseDealloc(MeshData<Real> *md) {
     }
   }
 
-  Kokkos::Profiling::popRegion(); // Task_SparseDealloc
   return TaskStatus::complete;
 }
 
diff --git a/src/interface/update.hpp b/src/interface/update.hpp
index eb0dbc57d3e6..d2cc37343802 100644
--- a/src/interface/update.hpp
+++ b/src/interface/update.hpp
@@ -70,7 +70,7 @@ TaskStatus UpdateWithFluxDivergence(T *data_u0, T *data_u1, const Real gam0,
 template <typename F, typename T>
 TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const Real w2,
                            T *out) {
-  Kokkos::Profiling::pushRegion("Task_WeightedSumData");
+  PARTHENON_INSTRUMENT
   const auto &x = in1->PackVariables(flags);
   const auto &y = in2->PackVariables(flags);
   const auto &z = out->PackVariables(flags);
@@ -85,7 +85,6 @@ TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const
           z(b, l, k, j, i) = w1 * x(b, l, k, j, i) + w2 * y(b, l, k, j, i);
         }
       });
-  Kokkos::Profiling::popRegion(); // Task_WeightedSumData
   return TaskStatus::complete;
 }
 
@@ -96,7 +95,7 @@ TaskStatus CopyData(const F &flags, T *in, T *out) {
 
 template <typename F, typename T>
 TaskStatus SetDataToConstant(const F &flags, T *data, const Real val) {
-  Kokkos::Profiling::pushRegion("Task_SetDataToConstant");
+  PARTHENON_INSTRUMENT
   const auto &x = data->PackVariables(flags);
   parthenon::par_for(
       DEFAULT_LOOP_PATTERN, "SetDataToConstant", DevExecSpace(), 0, x.GetDim(5) - 1, 0,
@@ -106,7 +105,6 @@ TaskStatus SetDataToConstant(const F &flags, T *data, const Real val) {
           x(b, l, k, j, i) = val;
         }
       });
-  Kokkos::Profiling::popRegion(); // Task_SetDataToConstant
   return TaskStatus::complete;
 }
 
@@ -148,7 +146,7 @@ template <typename F, typename T>
 TaskStatus Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data,
                     const LowStorageIntegrator *pint, Real dt, int stage,
                     bool update_s1) {
-  Kokkos::Profiling::pushRegion("Task_2S_Update");
+  PARTHENON_INSTRUMENT
   const auto &s0 = s0_data->PackVariables(flags);
   const auto &s1 = s1_data->PackVariables(flags);
   const auto &rhs = rhs_data->PackVariables(flags);
@@ -174,7 +172,6 @@ TaskStatus Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data,
                               beta * dt * rhs(b, l, k, j, i);
         }
       });
-  Kokkos::Profiling::popRegion(); // Task_2S_Update
   return TaskStatus::complete;
 }
 template <typename T>
@@ -194,7 +191,7 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr<T> base_data,
                       std::vector<std::shared_ptr<T>> stage_data,
                       std::shared_ptr<T> out_data, const ButcherIntegrator *pint, Real dt,
                       int stage) {
-  Kokkos::Profiling::pushRegion("Task_Butcher_Sum");
+  PARTHENON_INSTRUMENT
   const auto &out = out_data->PackVariables(flags);
   const auto &in = base_data->PackVariables(flags);
   const IndexDomain interior = IndexDomain::interior;
@@ -221,7 +218,6 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr<T> base_data,
           }
         });
   }
-  Kokkos::Profiling::popRegion(); // Task_Butcher_Sum
   return TaskStatus::complete;
 }
 template <typename T>
@@ -238,7 +234,7 @@ template <typename F, typename T>
 TaskStatus UpdateButcher(const F &flags, std::vector<std::shared_ptr<T>> stage_data,
                          std::shared_ptr<T> out_data, const ButcherIntegrator *pint,
                          Real dt) {
-  Kokkos::Profiling::pushRegion("Task_Butcher_Update");
+  PARTHENON_INSTRUMENT
 
   const auto &out = out_data->PackVariables(flags);
   const IndexDomain interior = IndexDomain::interior;
@@ -259,7 +255,6 @@ TaskStatus UpdateButcher(const F &flags, std::vector<std::shared_ptr<T>> stage_d
           }
         });
   }
-  Kokkos::Profiling::popRegion(); // Task_Butcher_Update
   return TaskStatus::complete;
 }
 template <typename F, typename T>
@@ -272,53 +267,54 @@ TaskStatus UpdateButcherIndependent(std::vector<std::shared_ptr<T>> stage_data,
 
 template <typename T>
 TaskStatus EstimateTimestep(T *rc) {
-  Kokkos::Profiling::pushRegion("Task_EstimateTimestep");
+  PARTHENON_INSTRUMENT
   Real dt_min = std::numeric_limits<Real>::max();
   for (const auto &pkg : rc->GetParentPointer()->packages.AllPackages()) {
     Real dt = pkg.second->EstimateTimestep(rc);
     dt_min = std::min(dt_min, dt);
   }
   rc->SetAllowedDt(dt_min);
-  Kokkos::Profiling::popRegion(); // Task_EstimateTimestep
   return TaskStatus::complete;
 }
 
 template <typename T>
 TaskStatus PreCommFillDerived(T *rc) {
-  Kokkos::Profiling::pushRegion("Task_PreCommFillDerived");
+  PARTHENON_INSTRUMENT
   auto pm = rc->GetParentPointer();
   for (const auto &pkg : pm->packages.AllPackages()) {
     pkg.second->PreCommFillDerived(rc);
   }
-  Kokkos::Profiling::popRegion();
   return TaskStatus::complete;
 }
 
 template <typename T>
 TaskStatus FillDerived(T *rc) {
-  Kokkos::Profiling::pushRegion("Task_FillDerived");
+  PARTHENON_INSTRUMENT
   auto pm = rc->GetParentPointer();
-  Kokkos::Profiling::pushRegion("PreFillDerived");
-  for (const auto &pkg : pm->packages.AllPackages()) {
-    pkg.second->PreFillDerived(rc);
-  }
-  Kokkos::Profiling::popRegion(); // PreFillDerived
-  Kokkos::Profiling::pushRegion("FillDerived");
-  for (const auto &pkg : pm->packages.AllPackages()) {
-    pkg.second->FillDerived(rc);
-  }
-  Kokkos::Profiling::popRegion(); // FillDerived
-  Kokkos::Profiling::pushRegion("PostFillDerived");
-  for (const auto &pkg : pm->packages.AllPackages()) {
-    pkg.second->PostFillDerived(rc);
-  }
-  Kokkos::Profiling::popRegion(); // PostFillDerived
-  Kokkos::Profiling::popRegion(); // Task_FillDerived
+  { // PreFillDerived region
+    PARTHENON_INSTRUMENT_REGION("PreFillDerived")
+    for (const auto &pkg : pm->packages.AllPackages()) {
+      pkg.second->PreFillDerived(rc);
+    }
+  } // PreFillDerived region
+  { // FillDerived region
+    PARTHENON_INSTRUMENT_REGION("FillDerived");
+    for (const auto &pkg : pm->packages.AllPackages()) {
+      pkg.second->FillDerived(rc);
+    }
+  } // FillDerived region
+  { // PostFillDerived region
+    PARTHENON_INSTRUMENT_REGION("PostFillDerived");
+    for (const auto &pkg : pm->packages.AllPackages()) {
+      pkg.second->PostFillDerived(rc);
+    }
+  } // PostFillDerived region
   return TaskStatus::complete;
 }
 
 template <typename T>
 TaskStatus InitNewlyAllocatedVars(T *rc) {
+  PARTHENON_INSTRUMENT
   if (!rc->AllVariablesInitialized()) {
     const IndexDomain interior = IndexDomain::interior;
     const IndexRange ib = rc->GetBoundsI(interior);
@@ -369,12 +365,10 @@ TaskStatus InitNewlyAllocatedVars(T *rc) {
   // This has to be done even in the case where no blocks have been allocated
   // since the boundaries of allocated blocks could have received default data
   // in any case
-  Kokkos::Profiling::pushRegion("Task_InitNewlyAllocatedVars");
   auto pm = rc->GetParentPointer();
   for (const auto &pkg : pm->packages.AllPackages()) {
     pkg.second->InitNewlyAllocatedVars(rc);
   }
-  Kokkos::Profiling::popRegion();
 
   // Don't worry about flagging variables as initialized
   // since they will be flagged at the beginning of the
diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
index bfaf6a247a66..7a2220fd421f 100644
--- a/src/kokkos_abstraction.hpp
+++ b/src/kokkos_abstraction.hpp
@@ -29,6 +29,7 @@
 
 #include "parthenon_array_generic.hpp"
 #include "utils/error_checking.hpp"
+#include "utils/instrument.hpp"
 #include "utils/object_pool.hpp"
 
 namespace parthenon {
@@ -334,13 +335,12 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
                          DevExecSpace exec_space, const int &kl, const int &ku,
                          const int &jl, const int &ju, const int &il, const int &iu,
                          const Function &function) {
-  Kokkos::Profiling::pushRegion(name);
+  PARTHENON_INSTRUMENT_REGION(name)
   for (auto k = kl; k <= ku; k++)
     for (auto j = jl; j <= ju; j++)
 #pragma omp simd
       for (auto i = il; i <= iu; i++)
         function(k, j, i);
-  Kokkos::Profiling::popRegion();
 }
 
 // 4D loop using Kokkos 1D Range
@@ -468,14 +468,13 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
                          DevExecSpace exec_space, const int nl, const int nu,
                          const int kl, const int ku, const int jl, const int ju,
                          const int il, const int iu, const Function &function) {
-  Kokkos::Profiling::pushRegion(name);
+  PARTHENON_INSTRUMENT_REGION(name)
   for (auto n = nl; n <= nu; n++)
     for (auto k = kl; k <= ku; k++)
       for (auto j = jl; j <= ju; j++)
 #pragma omp simd
         for (auto i = il; i <= iu; i++)
           function(n, k, j, i);
-  Kokkos::Profiling::popRegion();
 }
 
 // 5D loop using MDRange loops
@@ -536,7 +535,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
                          const int nl, const int nu, const int kl, const int ku,
                          const int jl, const int ju, const int il, const int iu,
                          const Function &function) {
-  Kokkos::Profiling::pushRegion(name);
+  PARTHENON_INSTRUMENT_REGION(name)
   for (auto b = bl; b <= bu; b++)
     for (auto n = nl; n <= nu; n++)
       for (auto k = kl; k <= ku; k++)
@@ -544,7 +543,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
 #pragma omp simd
           for (auto i = il; i <= iu; i++)
             function(b, n, k, j, i);
-  Kokkos::Profiling::popRegion();
 }
 
 // 6D loop using MDRange loops
@@ -609,7 +607,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
                          const int ml, const int mu, const int nl, const int nu,
                          const int kl, const int ku, const int jl, const int ju,
                          const int il, const int iu, const Function &function) {
-  Kokkos::Profiling::pushRegion(name);
+  PARTHENON_INSTRUMENT_REGION(name)
   for (auto l = ll; l <= lu; l++)
     for (auto m = ml; m <= mu; m++)
       for (auto n = nl; n <= nu; n++)
@@ -618,7 +616,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name,
 #pragma omp simd
             for (auto i = il; i <= iu; i++)
               function(l, m, n, k, j, i);
-  Kokkos::Profiling::popRegion();
 }
 
 template <class... Args>
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 32ee6d398d37..4018d8220057 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -317,7 +317,7 @@ bool TryRecvSameToSame(int lid_recv, int send_rank, Variable<Real> *var, MeshBlo
 
 void Mesh::LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin,
                                                   ApplicationInput *app_in) {
-  Kokkos::Profiling::pushRegion("LoadBalancingAndAdaptiveMeshRefinement");
+  PARTHENON_INSTRUMENT
   int nnew = 0, ndel = 0;
 
   if (adaptive) {
@@ -342,7 +342,6 @@ void Mesh::LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin,
     }
     lb_flag_ = false;
   }
-  Kokkos::Profiling::popRegion(); // LoadBalancingAndAdaptiveMeshRefinement
 }
 
 // Private routines
@@ -405,7 +404,7 @@ void UpdateBlockList(std::vector<int> const &ranklist, std::vector<int> &nslist,
 void Mesh::CalculateLoadBalance(std::vector<double> const &costlist,
                                 std::vector<int> &ranklist, std::vector<int> &nslist,
                                 std::vector<int> &nblist) {
-  Kokkos::Profiling::pushRegion("CalculateLoadBalance");
+  PARTHENON_INSTRUMENT
   auto const total_blocks = costlist.size();
 
   using it = std::vector<double>::const_iterator;
@@ -452,7 +451,6 @@ void Mesh::CalculateLoadBalance(std::vector<double> const &costlist,
                 << std::endl;
     }
   }
-  Kokkos::Profiling::popRegion(); // CalculateLoadBalance
 }
 
 //----------------------------------------------------------------------------------------
@@ -492,7 +490,7 @@ void Mesh::UpdateCostList() {
 // \brief collect refinement flags and manipulate the MeshBlockTree
 
 void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) {
-  Kokkos::Profiling::pushRegion("UpdateMeshBlockTree");
+  PARTHENON_INSTRUMENT
   // compute nleaf= number of leaf MeshBlocks per refined block
   int nleaf = 2;
   if (!mesh_size.symmetry(X2DIR)) nleaf = 4;
@@ -520,7 +518,6 @@ void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) {
     tnderef += nderef[n];
   }
   if (tnref == 0 && tnderef < nleaf) { // nothing to do
-    Kokkos::Profiling::popRegion();    // UpdateMeshBlockTree
     return;
   }
 
@@ -622,8 +619,6 @@ void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) {
     bt->Derefine(ndel);
   }
   if (tnderef >= nleaf) delete[] clderef;
-
-  Kokkos::Profiling::popRegion(); // UpdateMeshBlockTree
 }
 
 //----------------------------------------------------------------------------------------
@@ -664,7 +659,7 @@ bool Mesh::GatherCostListAndCheckBalance() {
 
 void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in,
                                            int ntot) {
-  Kokkos::Profiling::pushRegion("RedistributeAndRefineMeshBlocks");
+  PARTHENON_INSTRUMENT
   // kill any cached packs
   mesh_data.PurgeNonBase();
   mesh_data.Get()->ClearCaches();
@@ -675,58 +670,57 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   if (!mesh_size.symmetry(X3DIR)) nleaf = 8;
 
   // construct new lists
-  Kokkos::Profiling::pushRegion("Construct new list");
   std::vector<LogicalLocation> newloc(ntot);
   std::vector<int> newrank(ntot);
   std::vector<double> newcost(ntot);
   std::vector<int> newtoold(ntot);
   std::vector<int> oldtonew(nbtotal);
-
   int nbtold = nbtotal;
-  tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal);
-
-  // create a list mapping the previous gid to the current one
-  oldtonew[0] = 0;
-  int mb_idx = 1;
-  for (int n = 1; n < ntot; n++) {
-    if (newtoold[n] == newtoold[n - 1] + 1) { // normal
-      oldtonew[mb_idx++] = n;
-    } else if (newtoold[n] == newtoold[n - 1] + nleaf) { // derefined
-      for (int j = 0; j < nleaf - 1; j++)
-        oldtonew[mb_idx++] = n - 1;
-      oldtonew[mb_idx++] = n;
-    }
-  }
-  // fill the last block
-  for (; mb_idx < nbtold; mb_idx++)
-    oldtonew[mb_idx] = ntot - 1;
-
-  current_level = 0;
   std::unordered_set<LogicalLocation> newly_refined;
-  for (int n = 0; n < ntot; n++) {
-    // "on" = "old n" = "old gid" = "old global MeshBlock ID"
-    int on = newtoold[n];
-    if (newloc[n].level() > current_level) // set the current max level
-      current_level = newloc[n].level();
-    if (newloc[n].level() >= loclist[on].level()) { // same or refined
-      newcost[n] = costlist[on];
-      // Keep a list of all blocks refined for below
-      if (newloc[n].level() > loclist[on].level()) {
-        newly_refined.insert(newloc[n]);
-      }
-    } else {
-      double acost = 0.0;
-      for (int l = 0; l < nleaf; l++)
-        acost += costlist[on + l];
-      newcost[n] = acost / nleaf;
-    }
-  }
-
   // store old nbstart and nbend before load balancing.
   int onbs = nslist[Globals::my_rank];
   int onbe = onbs + nblist[Globals::my_rank] - 1;
 
-  Kokkos::Profiling::popRegion(); // Construct new list
+  { // Construct new list region
+    PARTHENON_INSTRUMENT_REGION("Construct_new_list")
+    tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal);
+
+    // create a list mapping the previous gid to the current one
+    oldtonew[0] = 0;
+    int mb_idx = 1;
+    for (int n = 1; n < ntot; n++) {
+      if (newtoold[n] == newtoold[n - 1] + 1) { // normal
+        oldtonew[mb_idx++] = n;
+      } else if (newtoold[n] == newtoold[n - 1] + nleaf) { // derefined
+        for (int j = 0; j < nleaf - 1; j++)
+          oldtonew[mb_idx++] = n - 1;
+        oldtonew[mb_idx++] = n;
+      }
+    }
+    // fill the last block
+    for (; mb_idx < nbtold; mb_idx++)
+      oldtonew[mb_idx] = ntot - 1;
+
+    current_level = 0;
+    for (int n = 0; n < ntot; n++) {
+      // "on" = "old n" = "old gid" = "old global MeshBlock ID"
+      int on = newtoold[n];
+      if (newloc[n].level() > current_level) // set the current max level
+        current_level = newloc[n].level();
+      if (newloc[n].level() >= loclist[on].level()) { // same or refined
+        newcost[n] = costlist[on];
+        // Keep a list of all blocks refined for below
+        if (newloc[n].level() > loclist[on].level()) {
+          newly_refined.insert(newloc[n]);
+        }
+      } else {
+        double acost = 0.0;
+        for (int l = 0; l < nleaf; l++)
+          acost += costlist[on + l];
+        newcost[n] = acost / nleaf;
+      }
+    }
+  } // Construct new list region
 
   // Calculate new load balance
   CalculateLoadBalance(newcost, newrank, nslist, nblist);
@@ -763,64 +757,67 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
 
 #ifdef MPI_PARALLEL
   // Send data from old to new blocks
-  Kokkos::Profiling::pushRegion("AMR: Send");
   std::vector<MPI_Request> send_reqs;
-  for (int n = onbs; n <= onbe; n++) {
-    int nn = oldtonew[n];
-    LogicalLocation &oloc = loclist[n];
-    LogicalLocation &nloc = newloc[nn];
-    auto pb = FindMeshBlock(n);
-    if (nloc.level() == oloc.level() &&
-        newrank[nn] != Globals::my_rank) { // same level, different rank
-      for (auto &var : pb->vars_cc_)
-        send_reqs.emplace_back(SendSameToSame(nn - nslist[newrank[nn]], newrank[nn],
-                                              var.get(), pb.get(), this));
-    } else if (nloc.level() > oloc.level()) { // c2f
-      // c2f must communicate to multiple leaf blocks (unlike f2c, same2same)
-      for (int l = 0; l < nleaf; l++) {
-        const int nl = nn + l; // Leaf block index in new global block list
-        LogicalLocation &nloc = newloc[nl];
+  { // AMR Send region
+    PARTHENON_INSTRUMENT_REGION("AMR_Send")
+    for (int n = onbs; n <= onbe; n++) {
+      int nn = oldtonew[n];
+      LogicalLocation &oloc = loclist[n];
+      LogicalLocation &nloc = newloc[nn];
+      auto pb = FindMeshBlock(n);
+      if (nloc.level() == oloc.level() &&
+          newrank[nn] != Globals::my_rank) { // same level, different rank
         for (auto &var : pb->vars_cc_)
-          send_reqs.emplace_back(SendCoarseToFine(nl - nslist[newrank[nl]], newrank[nl],
-                                                  nloc, var.get(), this));
-      } // end loop over nleaf (unique to c2f branch in this step 6)
-    } else if (nloc.level() < oloc.level()) { // f2c: restrict + pack + send
-      for (auto &var : pb->vars_cc_)
-        send_reqs.emplace_back(SendFineToCoarse(nn - nslist[newrank[nn]], newrank[nn],
-                                                oloc, var.get(), this));
+          send_reqs.emplace_back(SendSameToSame(nn - nslist[newrank[nn]], newrank[nn],
+                                                var.get(), pb.get(), this));
+      } else if (nloc.level() > oloc.level()) { // c2f
+        // c2f must communicate to multiple leaf blocks (unlike f2c, same2same)
+        for (int l = 0; l < nleaf; l++) {
+          const int nl = nn + l; // Leaf block index in new global block list
+          LogicalLocation &nloc = newloc[nl];
+          for (auto &var : pb->vars_cc_)
+            send_reqs.emplace_back(SendCoarseToFine(nl - nslist[newrank[nl]], newrank[nl],
+                                                    nloc, var.get(), this));
+        } // end loop over nleaf (unique to c2f branch in this step 6)
+      } else if (nloc.level() < oloc.level()) { // f2c: restrict + pack + send
+        for (auto &var : pb->vars_cc_)
+          send_reqs.emplace_back(SendFineToCoarse(nn - nslist[newrank[nn]], newrank[nn],
+                                                  oloc, var.get(), this));
+      }
     }
-  }
-  Kokkos::Profiling::popRegion(); // AMR: Send
-#endif                            // MPI_PARALLEL
+  }    // AMR Send region
+#endif // MPI_PARALLEL
 
   // Construct a new MeshBlock list (moving the data within the MPI rank)
-  Kokkos::Profiling::pushRegion("AMR: Construct new MeshBlockList");
-  RegionSize block_size = GetBlockSize();
-
   BlockList_t new_block_list(nbe - nbs + 1);
-  for (int n = nbs; n <= nbe; n++) {
-    int on = newtoold[n];
-    if ((ranklist[on] == Globals::my_rank) &&
-        (loclist[on].level() == newloc[n].level())) {
-      // on the same MPI rank and same level -> just move it
-      new_block_list[n - nbs] = FindMeshBlock(on);
-      if (!new_block_list[n - nbs]) {
+  { // AMR Construct new MeshBlockList region
+    PARTHENON_INSTRUMENT_REGION("AMR_Construct_new_list")
+    RegionSize block_size = GetBlockSize();
+
+    for (int n = nbs; n <= nbe; n++) {
+      int on = newtoold[n];
+      if ((ranklist[on] == Globals::my_rank) &&
+          (loclist[on].level() == newloc[n].level())) {
+        // on the same MPI rank and same level -> just move it
+        new_block_list[n - nbs] = FindMeshBlock(on);
+        if (!new_block_list[n - nbs]) {
+          BoundaryFlag block_bcs[6];
+          SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs);
+          new_block_list[n - nbs] =
+              MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin,
+                              app_in, packages, resolved_packages, gflag);
+        }
+      } else {
+        // on a different refinement level or MPI rank - create a new block
         BoundaryFlag block_bcs[6];
         SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs);
+        // append new block to list of MeshBlocks
         new_block_list[n - nbs] =
             MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin,
                             app_in, packages, resolved_packages, gflag);
       }
-    } else {
-      // on a different refinement level or MPI rank - create a new block
-      BoundaryFlag block_bcs[6];
-      SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs);
-      // append new block to list of MeshBlocks
-      new_block_list[n - nbs] =
-          MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin, app_in,
-                          packages, resolved_packages, gflag);
     }
-  }
+  } // AMR Construct new MeshBlockList region
 
   // Replace the MeshBlock list
   auto old_block_list = std::move(block_list);
@@ -832,142 +829,141 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     block_list[n - nbs]->lid = n - nbs;
   }
 
-  Kokkos::Profiling::popRegion(); // AMR: Construct new MeshBlockList
-
   // Receive the data and load into MeshBlocks
-  Kokkos::Profiling::pushRegion("AMR: Recv data and unpack");
-  bool all_received;
-  int niter = 0;
-  if (block_list.size() > 0) {
-    // Create a vector for holding the status of all communications, it is sized to fit
-    // the maximal number of calculations that this rank could receive: the number of
-    // blocks on the rank x the number of variables x times the number of fine blocks
-    // that would communicate if every block had been coarsened (8 in 3D)
-    std::vector<bool> finished(
-        std::max((nbe - nbs + 1), 1) * FindMeshBlock(nbs)->vars_cc_.size() * 8, false);
-    do {
-      all_received = true;
-      niter++;
-      int idx = 0;
-      for (int n = nbs; n <= nbe; n++) {
-        int on = newtoold[n];
-        LogicalLocation &oloc = loclist[on];
-        LogicalLocation &nloc = newloc[n];
-        auto pb = FindMeshBlock(n);
-        if (oloc.level() == nloc.level() &&
-            ranklist[on] != Globals::my_rank) { // same level, different rank
+  { // AMR Recv and unpack data
+    PARTHENON_INSTRUMENT_REGION("AMR_Recv_unpack")
+    bool all_received;
+    int niter = 0;
+    if (block_list.size() > 0) {
+      // Create a vector for holding the status of all communications, it is sized to fit
+      // the maximal number of calculations that this rank could receive: the number of
+      // blocks on the rank x the number of variables x times the number of fine blocks
+      // that would communicate if every block had been coarsened (8 in 3D)
+      std::vector<bool> finished(
+          std::max((nbe - nbs + 1), 1) * FindMeshBlock(nbs)->vars_cc_.size() * 8, false);
+      do {
+        all_received = true;
+        niter++;
+        int idx = 0;
+        for (int n = nbs; n <= nbe; n++) {
+          int on = newtoold[n];
+          LogicalLocation &oloc = loclist[on];
+          LogicalLocation &nloc = newloc[n];
+          auto pb = FindMeshBlock(n);
+          if (oloc.level() == nloc.level() &&
+              ranklist[on] != Globals::my_rank) { // same level, different rank
 #ifdef MPI_PARALLEL
-          for (auto &var : pb->vars_cc_) {
-            if (!finished[idx])
-              finished[idx] =
-                  TryRecvSameToSame(n - nbs, ranklist[on], var.get(), pb.get(), this);
-            all_received = finished[idx++] && all_received;
-          }
+            for (auto &var : pb->vars_cc_) {
+              if (!finished[idx])
+                finished[idx] =
+                    TryRecvSameToSame(n - nbs, ranklist[on], var.get(), pb.get(), this);
+              all_received = finished[idx++] && all_received;
+            }
 #endif
-        } else if (oloc.level() > nloc.level()) { // f2c
-          for (int l = 0; l < nleaf; l++) {
-            auto pob = pb;
-            if (ranklist[on + l] == Globals::my_rank) pob = old_block_list[on + l - onbs];
-            LogicalLocation &oloc = loclist[on + l];
+          } else if (oloc.level() > nloc.level()) { // f2c
+            for (int l = 0; l < nleaf; l++) {
+              auto pob = pb;
+              if (ranklist[on + l] == Globals::my_rank)
+                pob = old_block_list[on + l - onbs];
+              LogicalLocation &oloc = loclist[on + l];
+              for (auto &var : pb->vars_cc_) {
+                if (!finished[idx]) {
+                  auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label());
+                  finished[idx] =
+                      TryRecvFineToCoarse(n - nbs, ranklist[on + l], oloc, var_in.get(),
+                                          var.get(), pb.get(), this);
+                }
+                all_received = finished[idx++] && all_received;
+              }
+            }
+          } else if (oloc.level() < nloc.level()) { // c2f
             for (auto &var : pb->vars_cc_) {
               if (!finished[idx]) {
+                auto pob = pb;
+                if (ranklist[on] == Globals::my_rank) pob = old_block_list[on - onbs];
                 auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label());
-                finished[idx] =
-                    TryRecvFineToCoarse(n - nbs, ranklist[on + l], oloc, var_in.get(),
-                                        var.get(), pb.get(), this);
+                finished[idx] = TryRecvCoarseToFine(
+                    n - nbs, ranklist[on], nloc, var_in.get(), var.get(), pb.get(), this);
               }
               all_received = finished[idx++] && all_received;
             }
           }
-        } else if (oloc.level() < nloc.level()) { // c2f
-          for (auto &var : pb->vars_cc_) {
-            if (!finished[idx]) {
-              auto pob = pb;
-              if (ranklist[on] == Globals::my_rank) pob = old_block_list[on - onbs];
-              auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label());
-              finished[idx] = TryRecvCoarseToFine(
-                  n - nbs, ranklist[on], nloc, var_in.get(), var.get(), pb.get(), this);
-            }
-            all_received = finished[idx++] && all_received;
-          }
         }
-      }
-      // rb_idx is a running index, so we repeat the loop until all vals are true
-    } while (!all_received && niter < 1e7);
-    if (!all_received) PARTHENON_FAIL("AMR Receive failed");
-  }
-  // Fence here to be careful that all communication is finished before moving
-  // on to prolongation
-  Kokkos::fence();
-
-  // Prolongate blocks that had a coarse buffer filled (i.e. c2f blocks)
-  ProResCache_t prolongation_cache;
-  int nprolong = 0;
-  for (int nn = nbs; nn <= nbe; nn++) {
-    int on = newtoold[nn];
-    auto pmb = FindMeshBlock(nn);
-    if (newloc[nn].level() > loclist[on].level()) nprolong += pmb->vars_cc_.size();
-  }
-  prolongation_cache.Initialize(nprolong, resolved_packages.get());
-  int iprolong = 0;
-  for (int nn = nbs; nn <= nbe; nn++) {
-    int on = newtoold[nn];
-    if (newloc[nn].level() > loclist[on].level()) {
+        // rb_idx is a running index, so we repeat the loop until all vals are true
+      } while (!all_received && niter < 1e7);
+      if (!all_received) PARTHENON_FAIL("AMR Receive failed");
+    }
+    // Fence here to be careful that all communication is finished before moving
+    // on to prolongation
+    Kokkos::fence();
+
+    // Prolongate blocks that had a coarse buffer filled (i.e. c2f blocks)
+    ProResCache_t prolongation_cache;
+    int nprolong = 0;
+    for (int nn = nbs; nn <= nbe; nn++) {
+      int on = newtoold[nn];
       auto pmb = FindMeshBlock(nn);
-      for (auto &var : pmb->vars_cc_) {
-        prolongation_cache.RegisterRegionHost(iprolong++,
-                                              ProResInfo::GetInteriorProlongate(pmb, var),
-                                              var.get(), resolved_packages.get());
+      if (newloc[nn].level() > loclist[on].level()) nprolong += pmb->vars_cc_.size();
+    }
+    prolongation_cache.Initialize(nprolong, resolved_packages.get());
+    int iprolong = 0;
+    for (int nn = nbs; nn <= nbe; nn++) {
+      int on = newtoold[nn];
+      if (newloc[nn].level() > loclist[on].level()) {
+        auto pmb = FindMeshBlock(nn);
+        for (auto &var : pmb->vars_cc_) {
+          prolongation_cache.RegisterRegionHost(
+              iprolong++, ProResInfo::GetInteriorProlongate(pmb, var), var.get(),
+              resolved_packages.get());
+        }
       }
     }
-  }
-  prolongation_cache.CopyToDevice();
-
-  refinement::ProlongateShared(resolved_packages.get(), prolongation_cache,
-                               block_list[0]->cellbounds, block_list[0]->c_cellbounds);
-
-  // update the lists
-  loclist = std::move(newloc);
-  ranklist = std::move(newrank);
-  costlist = std::move(newcost);
-
-  // A block newly refined and prolongated may have neighbors which were
-  // already refined to the new level.
-  // If so, the prolongated versions of shared elements will not reflect
-  // the true, finer versions present in the neighbor block.
-  // We must create any new fine buffers and fill them from these neighbors
-  // in order to maintain a consistent global state.
-  // Thus we rebuild and synchronize the mesh now, but using a unique
-  // neighbor precedence favoring the "old" fine blocks over "new" ones
-  for (auto &pmb : block_list) {
-    pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data(),
-                                      newly_refined);
-  }
-  // Make sure all old sends/receives are done before we reconfigure the mesh
-#ifdef MPI_PARALLEL
-  if (send_reqs.size() != 0)
-    PARTHENON_MPI_CHECK(
-        MPI_Waitall(send_reqs.size(), send_reqs.data(), MPI_STATUSES_IGNORE));
-#endif
-  // Re-initialize the mesh with our temporary ownership/neighbor configurations.
-  // No buffers are different when we switch to the final precedence order.
-  Initialize(false, pin, app_in);
+    prolongation_cache.CopyToDevice();
 
-  // Internal refinement relies on the fine shared values, which are only consistent after
-  // being updated with any previously fine versions
-  refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache,
+    refinement::ProlongateShared(resolved_packages.get(), prolongation_cache,
                                  block_list[0]->cellbounds, block_list[0]->c_cellbounds);
 
-  // Rebuild just the ownership model, this time weighting the "new" fine blocks just like
-  // any other blocks at their level.
-  for (auto &pmb : block_list) {
-    pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
-  }
+    // update the lists
+    loclist = std::move(newloc);
+    ranklist = std::move(newrank);
+    costlist = std::move(newcost);
+
+    // A block newly refined and prolongated may have neighbors which were
+    // already refined to the new level.
+    // If so, the prolongated versions of shared elements will not reflect
+    // the true, finer versions present in the neighbor block.
+    // We must create any new fine buffers and fill them from these neighbors
+    // in order to maintain a consistent global state.
+    // Thus we rebuild and synchronize the mesh now, but using a unique
+    // neighbor precedence favoring the "old" fine blocks over "new" ones
+    for (auto &pmb : block_list) {
+      pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data(),
+                                        newly_refined);
+    }
+    // Make sure all old sends/receives are done before we reconfigure the mesh
+#ifdef MPI_PARALLEL
+    if (send_reqs.size() != 0)
+      PARTHENON_MPI_CHECK(
+          MPI_Waitall(send_reqs.size(), send_reqs.data(), MPI_STATUSES_IGNORE));
+#endif
+    // Re-initialize the mesh with our temporary ownership/neighbor configurations.
+    // No buffers are different when we switch to the final precedence order.
+    Initialize(false, pin, app_in);
+
+    // Internal refinement relies on the fine shared values, which are only consistent
+    // after being updated with any previously fine versions
+    refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache,
+                                   block_list[0]->cellbounds,
+                                   block_list[0]->c_cellbounds);
+
+    // Rebuild just the ownership model, this time weighting the "new" fine blocks just
+    // like any other blocks at their level.
+    for (auto &pmb : block_list) {
+      pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
+    }
 
-  Kokkos::Profiling::popRegion(); // AMR: Recv data and unpack
+  } // AMR Recv and unpack data
 
   ResetLoadBalanceVariables();
-
-  Kokkos::Profiling::popRegion(); // RedistributeAndRefineMeshBlocks
 }
 } // namespace parthenon
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index ed81e3e4e219..a6651154ae3a 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -987,7 +987,7 @@ void Mesh::ApplyUserWorkBeforeOutput(ParameterInput *pin) {
 // \brief  initialization before the main loop as well as during remeshing
 
 void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *app_in) {
-  Kokkos::Profiling::pushRegion("Mesh::Initialize");
+  PARTHENON_INSTRUMENT
   bool init_done = true;
   const int nb_initial = nbtotal;
   do {
@@ -1174,8 +1174,6 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
 
   // Initialize the "base" MeshData object
   mesh_data.Get()->Set(block_list);
-
-  Kokkos::Profiling::popRegion(); // Mesh::Initialize
 }
 
 /// Finds location of a block with ID `tgid`.
diff --git a/src/outputs/outputs.cpp b/src/outputs/outputs.cpp
index de54d7827c59..14eac0dd20fa 100644
--- a/src/outputs/outputs.cpp
+++ b/src/outputs/outputs.cpp
@@ -402,7 +402,7 @@ void OutputType::ClearOutputData() {
 
 void Outputs::MakeOutputs(Mesh *pm, ParameterInput *pin, SimTime *tm,
                           const SignalHandler::OutputSignal signal) {
-  Kokkos::Profiling::pushRegion("MakeOutputs");
+  PARTHENON_INSTRUMENT
   bool first = true;
   OutputType *ptype = pfirst_type_;
   while (ptype != nullptr) {
@@ -418,7 +418,6 @@ void Outputs::MakeOutputs(Mesh *pm, ParameterInput *pin, SimTime *tm,
     }
     ptype = ptype->pnext_type; // move to next OutputType node in singly linked list
   }
-  Kokkos::Profiling::popRegion(); // MakeOutputs
 }
 
 } // namespace parthenon
diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index 95b89b93b627..866d4096015d 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -63,12 +63,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
                                       const SignalHandler::OutputSignal signal) {
   using namespace HDF5;
   using namespace OutputUtils;
-
-  if constexpr (WRITE_SINGLE_PRECISION) {
-    Kokkos::Profiling::pushRegion("PHDF5::WriteOutputFileSinglePrec");
-  } else {
-    Kokkos::Profiling::pushRegion("PHDF5::WriteOutputFileRealPrec");
-  }
+  PARTHENON_INSTRUMENT
 
   // writes all graphics variables to hdf file
   // HDF5 structures
@@ -117,95 +112,94 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   // -------------------------------------------------------------------------------- //
   //   WRITING ATTRIBUTES                                                             //
   // -------------------------------------------------------------------------------- //
-  Kokkos::Profiling::pushRegion("write Attributes");
-  {
-    Kokkos::Profiling::pushRegion("write input");
-    // write input key-value pairs
-    std::ostringstream oss;
-    pin->ParameterDump(oss);
-
-    // Mesh information
-    const H5G input_group = MakeGroup(file, "/Input");
-
-    HDF5WriteAttribute("File", oss.str().c_str(), input_group);
-    Kokkos::Profiling::popRegion(); // write input
-  }                                 // Input section
-
-  // we'll need this again at the end
-  const H5G info_group = MakeGroup(file, "/Info");
-  {
-    Kokkos::Profiling::pushRegion("write Info");
-    HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group);
+  H5G info_group;
+  { // write attributes
+    PARTHENON_INSTRUMENT_REGION("write_attributes") {
+      PARTHENON_INSTRUMENT_REGION("write_input")
+      // write input key-value pairs
+      std::ostringstream oss;
+      pin->ParameterDump(oss);
+
+      // Mesh information
+      const H5G input_group = MakeGroup(file, "/Input");
+
+      HDF5WriteAttribute("File", oss.str().c_str(), input_group);
+    } // Input section
+
+    // we'll need this again at the end
+    info_group = MakeGroup(file, "/Info");
+    {
+      PARTHENON_INSTRUMENT_REGION("write_info")
+      HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group);
 
-    if (tm != nullptr) {
-      HDF5WriteAttribute("NCycle", tm->ncycle, info_group);
-      HDF5WriteAttribute("Time", tm->time, info_group);
-      HDF5WriteAttribute("dt", tm->dt, info_group);
-    }
+      if (tm != nullptr) {
+        HDF5WriteAttribute("NCycle", tm->ncycle, info_group);
+        HDF5WriteAttribute("Time", tm->time, info_group);
+        HDF5WriteAttribute("dt", tm->dt, info_group);
+      }
 
-    HDF5WriteAttribute("WallTime", Driver::elapsed_main(), info_group);
-    HDF5WriteAttribute("NumDims", pm->ndim, info_group);
-    HDF5WriteAttribute("NumMeshBlocks", pm->nbtotal, info_group);
-    HDF5WriteAttribute("MaxLevel", max_level, info_group);
-    // write whether we include ghost cells or not
-    HDF5WriteAttribute("IncludesGhost", output_params.include_ghost_zones ? 1 : 0,
-                       info_group);
-    // write number of ghost cells in simulation
-    HDF5WriteAttribute("NGhost", Globals::nghost, info_group);
-    HDF5WriteAttribute("Coordinates", std::string(first_block.coords.Name()).c_str(),
-                       info_group);
-
-    // restart info, write always
-    HDF5WriteAttribute("NBNew", pm->nbnew, info_group);
-    HDF5WriteAttribute("NBDel", pm->nbdel, info_group);
-    HDF5WriteAttribute("RootLevel", rootLevel, info_group);
-    HDF5WriteAttribute("Refine", pm->adaptive ? 1 : 0, info_group);
-    HDF5WriteAttribute("Multilevel", pm->multilevel ? 1 : 0, info_group);
-
-    HDF5WriteAttribute("BlocksPerPE", nblist, info_group);
-
-    // Mesh block size
-    HDF5WriteAttribute("MeshBlockSize", std::vector<int>{nx1, nx2, nx3}, info_group);
-
-    // RootGridDomain - float[9] array with xyz mins, maxs, rats (dx(i)/dx(i-1))
-    HDF5WriteAttribute(
-        "RootGridDomain",
-        std::vector<Real>{pm->mesh_size.xmin(X1DIR), pm->mesh_size.xmax(X1DIR),
-                          pm->mesh_size.xrat(X1DIR), pm->mesh_size.xmin(X2DIR),
-                          pm->mesh_size.xmax(X2DIR), pm->mesh_size.xrat(X2DIR),
-                          pm->mesh_size.xmin(X3DIR), pm->mesh_size.xmax(X3DIR),
-                          pm->mesh_size.xrat(X3DIR)},
-        info_group);
-
-    // Root grid size (number of cells at root level)
-    HDF5WriteAttribute("RootGridSize",
-                       std::vector<int>{pm->mesh_size.nx(X1DIR), pm->mesh_size.nx(X2DIR),
-                                        pm->mesh_size.nx(X3DIR)},
-                       info_group);
-
-    // Boundary conditions
-    std::vector<std::string> boundary_condition_str(BOUNDARY_NFACES);
-    for (size_t i = 0; i < boundary_condition_str.size(); i++) {
-      boundary_condition_str[i] = GetBoundaryString(pm->mesh_bcs[i]);
-    }
+      HDF5WriteAttribute("WallTime", Driver::elapsed_main(), info_group);
+      HDF5WriteAttribute("NumDims", pm->ndim, info_group);
+      HDF5WriteAttribute("NumMeshBlocks", pm->nbtotal, info_group);
+      HDF5WriteAttribute("MaxLevel", max_level, info_group);
+      // write whether we include ghost cells or not
+      HDF5WriteAttribute("IncludesGhost", output_params.include_ghost_zones ? 1 : 0,
+                         info_group);
+      // write number of ghost cells in simulation
+      HDF5WriteAttribute("NGhost", Globals::nghost, info_group);
+      HDF5WriteAttribute("Coordinates", std::string(first_block.coords.Name()).c_str(),
+                         info_group);
+
+      // restart info, write always
+      HDF5WriteAttribute("NBNew", pm->nbnew, info_group);
+      HDF5WriteAttribute("NBDel", pm->nbdel, info_group);
+      HDF5WriteAttribute("RootLevel", rootLevel, info_group);
+      HDF5WriteAttribute("Refine", pm->adaptive ? 1 : 0, info_group);
+      HDF5WriteAttribute("Multilevel", pm->multilevel ? 1 : 0, info_group);
+
+      HDF5WriteAttribute("BlocksPerPE", nblist, info_group);
+
+      // Mesh block size
+      HDF5WriteAttribute("MeshBlockSize", std::vector<int>{nx1, nx2, nx3}, info_group);
+
+      // RootGridDomain - float[9] array with xyz mins, maxs, rats (dx(i)/dx(i-1))
+      HDF5WriteAttribute(
+          "RootGridDomain",
+          std::vector<Real>{pm->mesh_size.xmin(X1DIR), pm->mesh_size.xmax(X1DIR),
+                            pm->mesh_size.xrat(X1DIR), pm->mesh_size.xmin(X2DIR),
+                            pm->mesh_size.xmax(X2DIR), pm->mesh_size.xrat(X2DIR),
+                            pm->mesh_size.xmin(X3DIR), pm->mesh_size.xmax(X3DIR),
+                            pm->mesh_size.xrat(X3DIR)},
+          info_group);
+
+      // Root grid size (number of cells at root level)
+      HDF5WriteAttribute("RootGridSize",
+                         std::vector<int>{pm->mesh_size.nx(X1DIR),
+                                          pm->mesh_size.nx(X2DIR),
+                                          pm->mesh_size.nx(X3DIR)},
+                         info_group);
+
+      // Boundary conditions
+      std::vector<std::string> boundary_condition_str(BOUNDARY_NFACES);
+      for (size_t i = 0; i < boundary_condition_str.size(); i++) {
+        boundary_condition_str[i] = GetBoundaryString(pm->mesh_bcs[i]);
+      }
 
-    HDF5WriteAttribute("BoundaryConditions", boundary_condition_str, info_group);
-    Kokkos::Profiling::popRegion(); // write Info
-  }                                 // Info section
+      HDF5WriteAttribute("BoundaryConditions", boundary_condition_str, info_group);
+    } // Info section
 
-  // write Params
-  {
-    Kokkos::Profiling::pushRegion("behold: write Params");
-    const H5G params_group = MakeGroup(file, "/Params");
+    // write Params
+    {
+      PARTHENON_INSTRUMENT_REGION("write_params")
+      const H5G params_group = MakeGroup(file, "/Params");
 
-    for (const auto &package : pm->packages.AllPackages()) {
-      const auto state = package.second;
-      // Write all params that can be written as HDF5 attributes
-      state->AllParams().WriteAllToHDF5(state->label(), params_group);
-    }
-    Kokkos::Profiling::popRegion(); // behold: write Params
-  }                                 // Params section
-  Kokkos::Profiling::popRegion();   // write Attributes
+      for (const auto &package : pm->packages.AllPackages()) {
+        const auto state = package.second;
+        // Write all params that can be written as HDF5 attributes
+        state->AllParams().WriteAllToHDF5(state->label(), params_group);
+      }
+    } // Params section
+  }   // write attributes
 
   // -------------------------------------------------------------------------------- //
   //   WRITING MESHBLOCK METADATA                                                     //
@@ -242,7 +236,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
   // write Blocks metadata
   {
-    Kokkos::Profiling::pushRegion("write block metadata");
+    PARTHENON_INSTRUMENT_REGION("write_block_metadata")
     const H5G gBlocks = MakeGroup(file, "/Blocks");
 
     // write Xmin[ndim] for blocks
@@ -272,39 +266,39 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
       HDF5Write2D(gBlocks, "loc.level-gid-lid-cnghost-gflag", tmpID.data(), p_loc_offset,
                   p_loc_cnt, p_glob_cnt, pl_xfer);
     }
-    Kokkos::Profiling::popRegion(); // write block metadata
-  }                                 // Block section
+  } // Block section
 
   // Write mesh coordinates to file
-  Kokkos::Profiling::pushRegion("write mesh coords");
-  for (const bool face : {true, false}) {
-    const H5G gLocations = MakeGroup(file, face ? "/Locations" : "/VolumeLocations");
+  { // write mesh coords
+    PARTHENON_INSTRUMENT_REGION("write_mesh_coords");
+    for (const bool face : {true, false}) {
+      const H5G gLocations = MakeGroup(file, face ? "/Locations" : "/VolumeLocations");
 
-    // write X coordinates
-    std::vector<Real> loc_x((nx1 + face) * num_blocks_local);
-    std::vector<Real> loc_y((nx2 + face) * num_blocks_local);
-    std::vector<Real> loc_z((nx3 + face) * num_blocks_local);
+      // write X coordinates
+      std::vector<Real> loc_x((nx1 + face) * num_blocks_local);
+      std::vector<Real> loc_y((nx2 + face) * num_blocks_local);
+      std::vector<Real> loc_z((nx3 + face) * num_blocks_local);
 
-    ComputeCoords_(pm, face, out_ib, out_jb, out_kb, loc_x, loc_y, loc_z);
+      ComputeCoords_(pm, face, out_ib, out_jb, out_kb, loc_x, loc_y, loc_z);
 
-    local_count[1] = global_count[1] = nx1 + face;
-    HDF5Write2D(gLocations, "x", loc_x.data(), p_loc_offset, p_loc_cnt, p_glob_cnt,
-                pl_xfer);
+      local_count[1] = global_count[1] = nx1 + face;
+      HDF5Write2D(gLocations, "x", loc_x.data(), p_loc_offset, p_loc_cnt, p_glob_cnt,
+                  pl_xfer);
 
-    local_count[1] = global_count[1] = nx2 + face;
-    HDF5Write2D(gLocations, "y", loc_y.data(), p_loc_offset, p_loc_cnt, p_glob_cnt,
-                pl_xfer);
+      local_count[1] = global_count[1] = nx2 + face;
+      HDF5Write2D(gLocations, "y", loc_y.data(), p_loc_offset, p_loc_cnt, p_glob_cnt,
+                  pl_xfer);
 
-    local_count[1] = global_count[1] = nx3 + face;
-    HDF5Write2D(gLocations, "z", loc_z.data(), p_loc_offset, p_loc_cnt, p_glob_cnt,
-                pl_xfer);
-  }
-  Kokkos::Profiling::popRegion(); // write mesh coords
+      local_count[1] = global_count[1] = nx3 + face;
+      HDF5Write2D(gLocations, "z", loc_z.data(), p_loc_offset, p_loc_cnt, p_glob_cnt,
+                  pl_xfer);
+    }
+  } // write mesh coords
 
   // Write Levels and Logical Locations with the level for each Meshblock loclist contains
   // levels and logical locations for all meshblocks on all ranks
   {
-    Kokkos::Profiling::pushRegion("write levels and locations");
+    PARTHENON_INSTRUMENT_REGION("write_levels_and_locations")
     const auto &loclist = pm->GetLocList();
 
     std::vector<std::int64_t> levels;
@@ -331,236 +325,241 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
     // reset for collective output
     local_count[0] = num_blocks_local;
-    Kokkos::Profiling::popRegion(); // write levels and locations
   }
 
   // -------------------------------------------------------------------------------- //
   //   WRITING VARIABLES DATA                                                         //
   // -------------------------------------------------------------------------------- //
-  Kokkos::Profiling::pushRegion("write all variable data");
-
-  // All blocks have the same list of variable metadata that exist in the entire
-  // simulation, but not all variables may be allocated on all blocks
-
-  auto get_vars = [=](const std::shared_ptr<MeshBlock> pmb) {
-    auto &var_vec = pmb->meshblock_data.Get()->GetVariableVector();
-    if (restart_) {
-      // get all vars with flag Independent OR restart
-      return GetAnyVariables(
-          var_vec, {parthenon::Metadata::Independent, parthenon::Metadata::Restart});
-    } else {
-      return GetAnyVariables(var_vec, output_params.variables);
-    }
-  };
-
-  // get list of all vars, just use the first block since the list is the same for all
-  // blocks
   std::vector<VarInfo> all_vars_info;
-  const auto vars = get_vars(pm->block_list.front());
-  for (auto &v : vars) {
-    all_vars_info.emplace_back(v);
-  }
-
-  // sort alphabetically
-  std::sort(all_vars_info.begin(), all_vars_info.end(),
-            [](const VarInfo &a, const VarInfo &b) { return a.label < b.label; });
-
-  // We need to add information about the sparse variables to the HDF5 file, namely:
-  // 1) Which variables are sparse
-  // 2) Is a sparse id of a particular sparse variable allocated on a given block
-  //
-  // This information is stored in the dataset called "SparseInfo". The data set
-  // contains an attribute "SparseFields" that is a vector of strings with the names
-  // of the sparse fields (field name with sparse id, i.e. "bar_28", "bar_7", foo_1",
-  // "foo_145"). The field names are in alphabetical order, which is the same order
-  // they show up in all_unique_vars (because it's a sorted set).
-  //
-  // The dataset SparseInfo itself is a 2D array of bools. The first index is the
-  // global block index and the second index is the sparse field (same order as the
-  // SparseFields attribute). SparseInfo[b][v] is true if the sparse field with index
-  // v is allocated on the block with index b, otherwise the value is false
-
   std::vector<std::string> sparse_names;
-  std::unordered_map<std::string, size_t> sparse_field_idx;
-  for (auto &vinfo : all_vars_info) {
-    if (vinfo.is_sparse) {
-      sparse_field_idx.insert({vinfo.label, sparse_names.size()});
-      sparse_names.push_back(vinfo.label);
-    }
-  }
-
-  hsize_t num_sparse = sparse_names.size();
-  // can't use std::vector here because std::vector<hbool_t> is the same as
-  // std::vector<bool> and it doesn't have .data() member
-  std::unique_ptr<hbool_t[]> sparse_allocated(new hbool_t[num_blocks_local * num_sparse]);
-
-  // allocate space for largest size variable
-  int varSize_max = 0;
-  for (auto &vinfo : all_vars_info) {
-    const int varSize =
-        vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1;
-    varSize_max = std::max(varSize_max, varSize);
-  }
-
-  using OutT = typename std::conditional<WRITE_SINGLE_PRECISION, float, Real>::type;
-  std::vector<OutT> tmpData(varSize_max * num_blocks_local);
-
-  // create persistent spaces
-  local_count[0] = num_blocks_local;
-  global_count[0] = max_blocks_global;
-  local_count[4] = global_count[4] = nx3;
-  local_count[5] = global_count[5] = nx2;
-  local_count[6] = global_count[6] = nx1;
+  hsize_t num_sparse;
+  std::unique_ptr<hbool_t[]> sparse_allocated;
+  { // write all variable data
+    PARTHENON_INSTRUMENT_REGION("write_all_variable_data")
+
+    // All blocks have the same list of variable metadata that exist in the entire
+    // simulation, but not all variables may be allocated on all blocks
+
+    auto get_vars = [=](const std::shared_ptr<MeshBlock> pmb) {
+      auto &var_vec = pmb->meshblock_data.Get()->GetVariableVector();
+      if (restart_) {
+        // get all vars with flag Independent OR restart
+        return GetAnyVariables(
+            var_vec, {parthenon::Metadata::Independent, parthenon::Metadata::Restart});
+      } else {
+        return GetAnyVariables(var_vec, output_params.variables);
+      }
+    };
 
-  // for each variable we write
-  for (auto &vinfo : all_vars_info) {
-    Kokkos::Profiling::pushRegion("write variable loop");
-    // not really necessary, but doesn't hurt
-    memset(tmpData.data(), 0, tmpData.size() * sizeof(OutT));
+    // get list of all vars, just use the first block since the list is the same for all
+    // blocks
+    const auto vars = get_vars(pm->block_list.front());
+    for (auto &v : vars) {
+      all_vars_info.emplace_back(v);
+    }
 
-    const std::string var_name = vinfo.label;
-    const hsize_t nx6 = vinfo.nx6;
-    const hsize_t nx5 = vinfo.nx5;
-    const hsize_t nx4 = vinfo.nx4;
+    // sort alphabetically
+    std::sort(all_vars_info.begin(), all_vars_info.end(),
+              [](const VarInfo &a, const VarInfo &b) { return a.label < b.label; });
+
+    // We need to add information about the sparse variables to the HDF5 file, namely:
+    // 1) Which variables are sparse
+    // 2) Is a sparse id of a particular sparse variable allocated on a given block
+    //
+    // This information is stored in the dataset called "SparseInfo". The data set
+    // contains an attribute "SparseFields" that is a vector of strings with the names
+    // of the sparse fields (field name with sparse id, i.e. "bar_28", "bar_7", foo_1",
+    // "foo_145"). The field names are in alphabetical order, which is the same order
+    // they show up in all_unique_vars (because it's a sorted set).
+    //
+    // The dataset SparseInfo itself is a 2D array of bools. The first index is the
+    // global block index and the second index is the sparse field (same order as the
+    // SparseFields attribute). SparseInfo[b][v] is true if the sparse field with index
+    // v is allocated on the block with index b, otherwise the value is false
+
+    std::unordered_map<std::string, size_t> sparse_field_idx;
+    for (auto &vinfo : all_vars_info) {
+      if (vinfo.is_sparse) {
+        sparse_field_idx.insert({vinfo.label, sparse_names.size()});
+        sparse_names.push_back(vinfo.label);
+      }
+    }
 
-    local_count[1] = global_count[1] = nx6;
-    local_count[2] = global_count[2] = nx5;
-    local_count[3] = global_count[3] = nx4;
+    num_sparse = sparse_names.size();
+    // can't use std::vector here because std::vector<hbool_t> is the same as
+    // std::vector<bool> and it doesn't have .data() member
+    sparse_allocated = std::make_unique<hbool_t[]>(num_blocks_local * num_sparse);
+
+    // allocate space for largest size variable
+    int varSize_max = 0;
+    for (auto &vinfo : all_vars_info) {
+      const int varSize =
+          vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1;
+      varSize_max = std::max(varSize_max, varSize);
+    }
 
-    std::vector<hsize_t> alldims({nx6, nx5, nx4, static_cast<hsize_t>(vinfo.nx3),
-                                  static_cast<hsize_t>(vinfo.nx2),
-                                  static_cast<hsize_t>(vinfo.nx1)});
+    using OutT = typename std::conditional<WRITE_SINGLE_PRECISION, float, Real>::type;
+    std::vector<OutT> tmpData(varSize_max * num_blocks_local);
 
-    int ndim = -1;
+    // create persistent spaces
+    local_count[0] = num_blocks_local;
+    global_count[0] = max_blocks_global;
+    local_count[4] = global_count[4] = nx3;
+    local_count[5] = global_count[5] = nx2;
+    local_count[6] = global_count[6] = nx1;
+
+    // for each variable we write
+    for (auto &vinfo : all_vars_info) {
+      PARTHENON_INSTRUMENT_REGION("write_variable_loop")
+      // not really necessary, but doesn't hurt
+      memset(tmpData.data(), 0, tmpData.size() * sizeof(OutT));
+
+      const std::string var_name = vinfo.label;
+      const hsize_t nx6 = vinfo.nx6;
+      const hsize_t nx5 = vinfo.nx5;
+      const hsize_t nx4 = vinfo.nx4;
+
+      local_count[1] = global_count[1] = nx6;
+      local_count[2] = global_count[2] = nx5;
+      local_count[3] = global_count[3] = nx4;
+
+      std::vector<hsize_t> alldims({nx6, nx5, nx4, static_cast<hsize_t>(vinfo.nx3),
+                                    static_cast<hsize_t>(vinfo.nx2),
+                                    static_cast<hsize_t>(vinfo.nx1)});
+
+      int ndim = -1;
 #ifndef PARTHENON_DISABLE_HDF5_COMPRESSION
-    // we need chunks to enable compression
-    std::array<hsize_t, H5_NDIM> chunk_size({1, 1, 1, 1, 1, 1, 1});
+      // we need chunks to enable compression
+      std::array<hsize_t, H5_NDIM> chunk_size({1, 1, 1, 1, 1, 1, 1});
 #endif
-    if (vinfo.where == MetadataFlag(Metadata::Cell)) {
-      ndim = 3 + vinfo.tensor_rank + 1;
-      for (int i = 0; i < vinfo.tensor_rank; i++) {
-        local_count[1 + i] = global_count[1 + i] = alldims[3 - vinfo.tensor_rank + i];
-      }
-      local_count[vinfo.tensor_rank + 1] = global_count[vinfo.tensor_rank + 1] = nx3;
-      local_count[vinfo.tensor_rank + 2] = global_count[vinfo.tensor_rank + 2] = nx2;
-      local_count[vinfo.tensor_rank + 3] = global_count[vinfo.tensor_rank + 3] = nx1;
+      if (vinfo.where == MetadataFlag(Metadata::Cell)) {
+        ndim = 3 + vinfo.tensor_rank + 1;
+        for (int i = 0; i < vinfo.tensor_rank; i++) {
+          local_count[1 + i] = global_count[1 + i] = alldims[3 - vinfo.tensor_rank + i];
+        }
+        local_count[vinfo.tensor_rank + 1] = global_count[vinfo.tensor_rank + 1] = nx3;
+        local_count[vinfo.tensor_rank + 2] = global_count[vinfo.tensor_rank + 2] = nx2;
+        local_count[vinfo.tensor_rank + 3] = global_count[vinfo.tensor_rank + 3] = nx1;
 
 #ifndef PARTHENON_DISABLE_HDF5_COMPRESSION
-      if (output_params.hdf5_compression_level > 0) {
-        for (int i = ndim - 3; i < ndim; i++) {
-          chunk_size[i] = local_count[i];
+        if (output_params.hdf5_compression_level > 0) {
+          for (int i = ndim - 3; i < ndim; i++) {
+            chunk_size[i] = local_count[i];
+          }
         }
-      }
 #endif
-    } else if (vinfo.where == MetadataFlag(Metadata::None)) {
-      ndim = vinfo.tensor_rank + 1;
-      for (int i = 0; i < vinfo.tensor_rank; i++) {
-        local_count[1 + i] = global_count[1 + i] = alldims[6 - vinfo.tensor_rank + i];
-      }
+      } else if (vinfo.where == MetadataFlag(Metadata::None)) {
+        ndim = vinfo.tensor_rank + 1;
+        for (int i = 0; i < vinfo.tensor_rank; i++) {
+          local_count[1 + i] = global_count[1 + i] = alldims[6 - vinfo.tensor_rank + i];
+        }
 
 #ifndef PARTHENON_DISABLE_HDF5_COMPRESSION
-      if (output_params.hdf5_compression_level > 0) {
-        int nchunk_indices = std::min<int>(vinfo.tensor_rank, 3);
-        for (int i = ndim - nchunk_indices; i < ndim; i++) {
-          chunk_size[i] = alldims[6 - nchunk_indices + i];
+        if (output_params.hdf5_compression_level > 0) {
+          int nchunk_indices = std::min<int>(vinfo.tensor_rank, 3);
+          for (int i = ndim - nchunk_indices; i < ndim; i++) {
+            chunk_size[i] = alldims[6 - nchunk_indices + i];
+          }
         }
-      }
 #endif
-    } else {
-      PARTHENON_THROW("Only Cell and None locations supported!");
-    }
+      } else {
+        PARTHENON_THROW("Only Cell and None locations supported!");
+      }
 
 #ifndef PARTHENON_DISABLE_HDF5_COMPRESSION
-    PARTHENON_HDF5_CHECK(H5Pset_chunk(pl_dcreate, ndim, chunk_size.data()));
-    // Do not run the pipeline if compression is soft disabled.
-    // By default data would still be passed, which may result in slower output.
-    if (output_params.hdf5_compression_level > 0) {
-      PARTHENON_HDF5_CHECK(
-          H5Pset_deflate(pl_dcreate, std::min(9, output_params.hdf5_compression_level)));
-    }
+      PARTHENON_HDF5_CHECK(H5Pset_chunk(pl_dcreate, ndim, chunk_size.data()));
+      // Do not run the pipeline if compression is soft disabled.
+      // By default data would still be passed, which may result in slower output.
+      if (output_params.hdf5_compression_level > 0) {
+        PARTHENON_HDF5_CHECK(H5Pset_deflate(
+            pl_dcreate, std::min(9, output_params.hdf5_compression_level)));
+      }
 #endif
 
-    // load up data
-    hsize_t index = 0;
-
-    Kokkos::Profiling::pushRegion("fill host output buffer");
-    // for each local mesh block
-    for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) {
-      const auto &pmb = pm->block_list[b_idx];
-      bool is_allocated = false;
-
-      // for each variable that this local meshblock actually has
-      const auto vars = get_vars(pmb);
-      for (auto &v : vars) {
-        // For reference, if we update the logic here, there's also
-        // a similar block in parthenon_manager.cpp
-        if (v->IsAllocated() && (var_name == v->label())) {
-          auto v_h = v->data.GetHostMirrorAndCopy();
-          for (int t = 0; t < nx6; ++t) {
-            for (int u = 0; u < nx5; ++u) {
-              for (int v = 0; v < nx4; ++v) {
-                if (vinfo.where == MetadataFlag(Metadata::Cell)) {
-                  for (int k = out_kb.s; k <= out_kb.e; ++k) {
-                    for (int j = out_jb.s; j <= out_jb.e; ++j) {
-                      for (int i = out_ib.s; i <= out_ib.e; ++i) {
-                        tmpData[index++] = static_cast<OutT>(v_h(t, u, v, k, j, i));
+      // load up data
+      hsize_t index = 0;
+
+      { // fill host output buffer
+        PARTHENON_INSTRUMENT_REGION("fill_host_output_buffer")
+        // for each local mesh block
+        for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) {
+          const auto &pmb = pm->block_list[b_idx];
+          bool is_allocated = false;
+
+          // for each variable that this local meshblock actually has
+          const auto vars = get_vars(pmb);
+          for (auto &v : vars) {
+            // For reference, if we update the logic here, there's also
+            // a similar block in parthenon_manager.cpp
+            if (v->IsAllocated() && (var_name == v->label())) {
+              auto v_h = v->data.GetHostMirrorAndCopy();
+              for (int t = 0; t < nx6; ++t) {
+                for (int u = 0; u < nx5; ++u) {
+                  for (int v = 0; v < nx4; ++v) {
+                    if (vinfo.where == MetadataFlag(Metadata::Cell)) {
+                      for (int k = out_kb.s; k <= out_kb.e; ++k) {
+                        for (int j = out_jb.s; j <= out_jb.e; ++j) {
+                          for (int i = out_ib.s; i <= out_ib.e; ++i) {
+                            tmpData[index++] = static_cast<OutT>(v_h(t, u, v, k, j, i));
+                          }
+                        }
                       }
-                    }
-                  }
-                } else {
-                  for (int k = 0; k < vinfo.nx3; ++k) {
-                    for (int j = 0; j < vinfo.nx2; ++j) {
-                      for (int i = 0; i < vinfo.nx1; ++i) {
-                        tmpData[index++] = static_cast<OutT>(v_h(t, u, v, k, j, i));
+                    } else {
+                      for (int k = 0; k < vinfo.nx3; ++k) {
+                        for (int j = 0; j < vinfo.nx2; ++j) {
+                          for (int i = 0; i < vinfo.nx1; ++i) {
+                            tmpData[index++] = static_cast<OutT>(v_h(t, u, v, k, j, i));
+                          }
+                        }
                       }
                     }
                   }
                 }
               }
+
+              is_allocated = true;
+              break;
             }
           }
 
-          is_allocated = true;
-          break;
-        }
-      }
-
-      if (vinfo.is_sparse) {
-        size_t sparse_idx = sparse_field_idx.at(vinfo.label);
-        sparse_allocated[b_idx * num_sparse + sparse_idx] = is_allocated;
-      }
+          if (vinfo.is_sparse) {
+            size_t sparse_idx = sparse_field_idx.at(vinfo.label);
+            sparse_allocated[b_idx * num_sparse + sparse_idx] = is_allocated;
+          }
 
-      if (!is_allocated) {
-        if (vinfo.is_sparse) {
-          hsize_t varSize{};
-          if (vinfo.where == MetadataFlag(Metadata::Cell)) {
-            varSize = vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * (out_kb.e - out_kb.s + 1) *
-                      (out_jb.e - out_jb.s + 1) * (out_ib.e - out_ib.s + 1);
-          } else {
-            varSize =
-                vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1;
+          if (!is_allocated) {
+            if (vinfo.is_sparse) {
+              hsize_t varSize{};
+              if (vinfo.where == MetadataFlag(Metadata::Cell)) {
+                varSize = vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * (out_kb.e - out_kb.s + 1) *
+                          (out_jb.e - out_jb.s + 1) * (out_ib.e - out_ib.s + 1);
+              } else {
+                varSize =
+                    vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1;
+              }
+              auto fill_val = output_params.sparse_seed_nans
+                                  ? std::numeric_limits<OutT>::quiet_NaN()
+                                  : 0;
+              std::fill(tmpData.data() + index, tmpData.data() + index + varSize,
+                        fill_val);
+              index += varSize;
+            } else {
+              std::stringstream msg;
+              msg << "### ERROR: Unable to find dense variable " << var_name << std::endl;
+              PARTHENON_FAIL(msg);
+            }
           }
-          auto fill_val =
-              output_params.sparse_seed_nans ? std::numeric_limits<OutT>::quiet_NaN() : 0;
-          std::fill(tmpData.data() + index, tmpData.data() + index + varSize, fill_val);
-          index += varSize;
-        } else {
-          std::stringstream msg;
-          msg << "### ERROR: Unable to find dense variable " << var_name << std::endl;
-          PARTHENON_FAIL(msg);
         }
-      }
+      } // fill host output buffer
+
+      { // write variable data
+        PARTHENON_INSTRUMENT_REGION("write_variable_data")
+        // write data to file
+        HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt,
+                    p_glob_cnt, pl_xfer, pl_dcreate);
+      } // write variable data
     }
-    Kokkos::Profiling::popRegion(); // fill host output buffer
-
-    Kokkos::Profiling::pushRegion("write variable data");
-    // write data to file
-    HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt, p_glob_cnt,
-                pl_xfer, pl_dcreate);
-    Kokkos::Profiling::popRegion(); // write variable data
-    Kokkos::Profiling::popRegion(); // write variable loop
-  }
-  Kokkos::Profiling::popRegion(); // write all variable data
+  } // write all variable data
 
   // names of variables
   std::vector<std::string> var_names;
@@ -593,7 +592,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   // write SparseInfo and SparseFields (we can't write a zero-size dataset, so only write
   // this if we have sparse fields)
   if (num_sparse > 0) {
-    Kokkos::Profiling::pushRegion("write sparse info");
+    PARTHENON_INSTRUMENT_REGION("write_sparse_info")
     local_count[1] = global_count[1] = num_sparse;
 
     HDF5Write2D(file, "SparseInfo", sparse_allocated.get(), p_loc_offset, p_loc_cnt,
@@ -606,82 +605,81 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
     const H5D dset = H5D::FromHIDCheck(H5Dopen2(file, "SparseInfo", H5P_DEFAULT));
     HDF5WriteAttribute("SparseFields", names, dset);
-    Kokkos::Profiling::popRegion(); // write sparse info
-  }                                 // SparseInfo and SparseFields sections
+  } // SparseInfo and SparseFields sections
 
   // -------------------------------------------------------------------------------- //
   //   WRITING PARTICLE DATA                                                          //
   // -------------------------------------------------------------------------------- //
 
-  Kokkos::Profiling::pushRegion("write particle data");
   AllSwarmInfo swarm_info(pm->block_list, output_params.swarms, restart_);
-  for (auto &[swname, swinfo] : swarm_info.all_info) {
-    const H5G g_swm = MakeGroup(file, swname);
-    // offsets/counts are NOT the same here vs the grid data
-    hsize_t local_offset[6] = {static_cast<hsize_t>(my_offset), 0, 0, 0, 0, 0};
-    hsize_t local_count[6] = {static_cast<hsize_t>(num_blocks_local), 0, 0, 0, 0, 0};
-    hsize_t global_count[6] = {static_cast<hsize_t>(max_blocks_global), 0, 0, 0, 0, 0};
-    // These indicate particles/meshblock and location in global index
-    // space where each meshblock starts
-    HDF5Write1D(g_swm, "counts", swinfo.counts.data(), local_offset, local_count,
-                global_count, pl_xfer);
-    HDF5Write1D(g_swm, "offsets", swinfo.offsets.data(), local_offset, local_count,
-                global_count, pl_xfer);
-
-    const H5G g_var = MakeGroup(g_swm, "SwarmVars");
-    if (swinfo.global_count == 0) {
-      continue;
-    }
-    auto SetCounts = [&](const SwarmInfo &swinfo, const SwarmVarInfo &vinfo) {
-      const int rank = vinfo.tensor_rank;
-      for (int i = 0; i < 6; ++i) {
-        local_offset[i] = 0; // reset everything
-        local_count[i] = 0;
-        global_count[i] = 0;
+  { // write particle data
+    PARTHENON_INSTRUMENT_REGION("write_particle_data")
+    for (auto &[swname, swinfo] : swarm_info.all_info) {
+      const H5G g_swm = MakeGroup(file, swname);
+      // offsets/counts are NOT the same here vs the grid data
+      hsize_t local_offset[6] = {static_cast<hsize_t>(my_offset), 0, 0, 0, 0, 0};
+      hsize_t local_count[6] = {static_cast<hsize_t>(num_blocks_local), 0, 0, 0, 0, 0};
+      hsize_t global_count[6] = {static_cast<hsize_t>(max_blocks_global), 0, 0, 0, 0, 0};
+      // These indicate particles/meshblock and location in global index
+      // space where each meshblock starts
+      HDF5Write1D(g_swm, "counts", swinfo.counts.data(), local_offset, local_count,
+                  global_count, pl_xfer);
+      HDF5Write1D(g_swm, "offsets", swinfo.offsets.data(), local_offset, local_count,
+                  global_count, pl_xfer);
+
+      const H5G g_var = MakeGroup(g_swm, "SwarmVars");
+      if (swinfo.global_count == 0) {
+        continue;
       }
-      for (int i = 0; i < rank; ++i) {
-        local_count[i] = global_count[i] = vinfo.GetN(rank + 1 - i);
+      auto SetCounts = [&](const SwarmInfo &swinfo, const SwarmVarInfo &vinfo) {
+        const int rank = vinfo.tensor_rank;
+        for (int i = 0; i < 6; ++i) {
+          local_offset[i] = 0; // reset everything
+          local_count[i] = 0;
+          global_count[i] = 0;
+        }
+        for (int i = 0; i < rank; ++i) {
+          local_count[i] = global_count[i] = vinfo.GetN(rank + 1 - i);
+        }
+        local_offset[rank] = swinfo.global_offset;
+        local_count[rank] = swinfo.count_on_rank;
+        global_count[rank] = swinfo.global_count;
+      };
+      auto &int_vars = std::get<SwarmInfo::MapToVarVec<int>>(swinfo.vars);
+      for (auto &[vname, swmvarvec] : int_vars) {
+        const auto &vinfo = swinfo.var_info.at(vname);
+        auto host_data = swinfo.FillHostBuffer(vname, swmvarvec);
+        SetCounts(swinfo, vinfo);
+        HDF5WriteND(g_var, vname, host_data.data(), vinfo.tensor_rank + 1, local_offset,
+                    local_count, global_count, pl_xfer, H5P_DEFAULT);
+      }
+      auto &rvars = std::get<SwarmInfo::MapToVarVec<Real>>(swinfo.vars);
+      for (auto &[vname, swmvarvec] : rvars) {
+        const auto &vinfo = swinfo.var_info.at(vname);
+        auto host_data = swinfo.FillHostBuffer(vname, swmvarvec);
+        SetCounts(swinfo, vinfo);
+        HDF5WriteND(g_var, vname, host_data.data(), vinfo.tensor_rank + 1, local_offset,
+                    local_count, global_count, pl_xfer, H5P_DEFAULT);
+      }
+      // If swarm does not contain an "id" object, generate a sequential
+      // one for vis.
+      if (swinfo.var_info.count("id") == 0) {
+        std::vector<int> ids(swinfo.global_count);
+        std::iota(std::begin(ids), std::end(ids), swinfo.global_offset);
+        local_offset[0] = swinfo.global_offset;
+        local_count[0] = swinfo.count_on_rank;
+        global_count[0] = swinfo.global_count;
+        HDF5Write1D(g_var, "id", ids.data(), local_offset, local_count, global_count,
+                    pl_xfer);
       }
-      local_offset[rank] = swinfo.global_offset;
-      local_count[rank] = swinfo.count_on_rank;
-      global_count[rank] = swinfo.global_count;
-    };
-    auto &int_vars = std::get<SwarmInfo::MapToVarVec<int>>(swinfo.vars);
-    for (auto &[vname, swmvarvec] : int_vars) {
-      const auto &vinfo = swinfo.var_info.at(vname);
-      auto host_data = swinfo.FillHostBuffer(vname, swmvarvec);
-      SetCounts(swinfo, vinfo);
-      HDF5WriteND(g_var, vname, host_data.data(), vinfo.tensor_rank + 1, local_offset,
-                  local_count, global_count, pl_xfer, H5P_DEFAULT);
-    }
-    auto &rvars = std::get<SwarmInfo::MapToVarVec<Real>>(swinfo.vars);
-    for (auto &[vname, swmvarvec] : rvars) {
-      const auto &vinfo = swinfo.var_info.at(vname);
-      auto host_data = swinfo.FillHostBuffer(vname, swmvarvec);
-      SetCounts(swinfo, vinfo);
-      HDF5WriteND(g_var, vname, host_data.data(), vinfo.tensor_rank + 1, local_offset,
-                  local_count, global_count, pl_xfer, H5P_DEFAULT);
-    }
-    // If swarm does not contain an "id" object, generate a sequential
-    // one for vis.
-    if (swinfo.var_info.count("id") == 0) {
-      std::vector<int> ids(swinfo.global_count);
-      std::iota(std::begin(ids), std::end(ids), swinfo.global_offset);
-      local_offset[0] = swinfo.global_offset;
-      local_count[0] = swinfo.count_on_rank;
-      global_count[0] = swinfo.global_count;
-      HDF5Write1D(g_var, "id", ids.data(), local_offset, local_count, global_count,
-                  pl_xfer);
     }
-  }
-  Kokkos::Profiling::popRegion(); // write particle data
-
-  Kokkos::Profiling::pushRegion("genXDMF");
-  // generate XDMF companion file
-  XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info);
-  Kokkos::Profiling::popRegion(); // genXDMF
+  } // write particle data
 
-  Kokkos::Profiling::popRegion(); // WriteOutputFile???Prec
+  { // generate xdmf
+    PARTHENON_INSTRUMENT_REGION("genXDMF")
+    // generate XDMF companion file
+    XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info);
+  } // generate xdmf
 }
 
 std::string PHDF5Output::GenerateFilename_(ParameterInput *pin, SimTime *tm,
diff --git a/src/parthenon/prelude.hpp b/src/parthenon/prelude.hpp
index 8fafb2c94204..6f28cdcb9ee0 100644
--- a/src/parthenon/prelude.hpp
+++ b/src/parthenon/prelude.hpp
@@ -33,6 +33,7 @@ namespace prelude {
 using ::parthenon::BoundaryCommSubset;
 using ::parthenon::IndexDomain;
 using ::parthenon::IndexRange;
+using ::parthenon::KokkosTimer;
 using ::parthenon::MeshBlock;
 using ::parthenon::MeshBlockData;
 using ::parthenon::MeshData;

From 140ca4d6b24ec107831fc9fd695baac2b7b4b609 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Tue, 31 Oct 2023 11:02:50 -0600
Subject: [PATCH 40/67] actually add the new file

---
 src/utils/instrument.hpp | 43 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 src/utils/instrument.hpp

diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp
new file mode 100644
index 000000000000..3fa3fcb4f7c4
--- /dev/null
+++ b/src/utils/instrument.hpp
@@ -0,0 +1,43 @@
+//========================================================================================
+// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
+//
+// This program was produced under U.S. Government contract 89233218CNA000001 for Los
+// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+// in the program are reserved by Triad National Security, LLC, and the U.S. Department
+// of Energy/National Nuclear Security Administration. The Government is granted for
+// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+// license in this material to reproduce, prepare derivative works, distribute copies to
+// the public, perform publicly and display publicly, and to permit others to do so.
+//========================================================================================
+#ifndef UTILS_INSTRUMENT_HPP_
+#define UTILS_INSTRUMENT_HPP_
+
+#include <string>
+
+#include <Kokkos_Core.hpp>
+
+#define PARTHENON_INSTRUMENT KokkosTimer internal_inst_(__FILE__, __LINE__, __func__);
+#define PARTHENON_INSTRUMENT_REGION(name)                                                \
+  KokkosTimer internal_inst_reg____LINE__(__FILE__, __LINE__, name);
+
+namespace parthenon {
+
+struct KokkosTimer {
+  KokkosTimer(const std::string &file, const int line, const std::string &name) {
+    std::string label;
+    auto npos = file.rfind('/');
+    if (npos == std::string::npos) {
+      label = file;
+    } else {
+      label = file.substr(npos + 1);
+    }
+    label += "::" + std::to_string(line) + "::" + name;
+    Kokkos::Profiling::pushRegion(label);
+  }
+  ~KokkosTimer() { Kokkos::Profiling::popRegion(); }
+};
+
+} // namespace parthenon
+
+#endif

From 4856d8b189238ff3993b8de728975d51cedc915b Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Tue, 31 Oct 2023 11:03:31 -0600
Subject: [PATCH 41/67] and add it to the cmake

---
 src/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 54a226c55383..062702836532 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -231,6 +231,7 @@ add_library(parthenon
   utils/error_checking.cpp
   utils/hash.hpp
   utils/indexer.hpp
+  utils/instrument.hpp
   utils/loop_utils.hpp
   utils/morton_number.hpp
   utils/mpi_types.hpp

From 27dffb46126af032150c43fbffddc27b04b66cf6 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Tue, 31 Oct 2023 16:40:28 -0600
Subject: [PATCH 42/67] add some more macros for auto profiling

---
 benchmarks/burgers/burgers_package.cpp        |  6 ++--
 benchmarks/burgers/parthenon_app_inputs.cpp   |  2 +-
 example/advection/advection_package.cpp       | 31 ++++++++--------
 example/advection/parthenon_app_inputs.cpp    |  5 ++-
 example/calculate_pi/calculate_pi.cpp         |  2 +-
 example/kokkos_pi/kokkos_pi.cpp               |  2 +-
 .../particle_leapfrog/particle_leapfrog.cpp   |  4 +--
 example/particle_tracers/particle_tracers.cpp | 16 ++++-----
 example/particles/particles.cpp               | 17 +++++----
 example/poisson/parthenon_app_inputs.cpp      |  4 +--
 example/poisson/poisson_package.cpp           | 26 +++++++-------
 .../sparse_advection/parthenon_app_inputs.cpp |  4 +--
 .../sparse_advection_package.cpp              |  8 ++---
 .../stochastic_subgrid_package.cpp            | 21 +++++------
 src/bvals/comms/boundary_communication.cpp    |  4 +--
 src/bvals/comms/flux_correction.cpp           |  4 +--
 src/driver/driver.cpp                         |  2 +-
 src/interface/swarm.cpp                       | 15 ++++----
 src/interface/update.cpp                      | 12 +++----
 src/interface/update.hpp                      | 24 ++++++-------
 src/mesh/amr_loadbalance.cpp                  | 16 ++++-----
 src/outputs/ascent.cpp                        |  2 +-
 src/outputs/parthenon_hdf5.cpp                | 30 ++++++++--------
 src/prolong_restrict/pr_loops.hpp             |  8 ++---
 src/utils/buffer_utils.cpp                    | 10 +++---
 src/utils/instrument.hpp                      | 35 +++++++++++++------
 26 files changed, 162 insertions(+), 148 deletions(-)

diff --git a/benchmarks/burgers/burgers_package.cpp b/benchmarks/burgers/burgers_package.cpp
index b66bc69c2430..4c2c46a71d40 100644
--- a/benchmarks/burgers/burgers_package.cpp
+++ b/benchmarks/burgers/burgers_package.cpp
@@ -110,7 +110,7 @@ void CalculateDerived(MeshData<Real> *md) {
   size_t scratch_size = 0;
   constexpr int scratch_level = 0;
   parthenon::par_for_outer(
-      DEFAULT_OUTER_LOOP_PATTERN, "CalculateDerived", DevExecSpace(), scratch_size,
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size,
       scratch_level, 0, nblocks - 1, kb.s, kb.e, jb.s, jb.e,
       KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) {
         Real *out = &v(b, 0, k, j, 0);
@@ -193,7 +193,7 @@ TaskStatus CalculateFluxes(MeshData<Real> *md) {
   size_t scratch_size = 0;
   constexpr int scratch_level = 0;
   parthenon::par_for_outer(
-      DEFAULT_OUTER_LOOP_PATTERN, "burgers::reconstruction", DevExecSpace(), scratch_size,
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size,
       scratch_level, 0, nblocks - 1, kb.s - dk, kb.e + dk, jb.s - dj, jb.e + dj,
       KOKKOS_LAMBDA(team_mbr_t member, const int b, const int k, const int j) {
         bool xrec = (k >= kb.s && k <= kb.e) && (j >= jb.s && j <= jb.e);
@@ -264,7 +264,7 @@ TaskStatus CalculateFluxes(MeshData<Real> *md) {
   // now we'll solve the Riemann problems to get fluxes
   scratch_size = 2 * ScratchPad1D<Real>::shmem_size(ib.e + 1);
   parthenon::par_for_outer(
-      DEFAULT_OUTER_LOOP_PATTERN, "burgers::reconstruction", DevExecSpace(), scratch_size,
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size,
       scratch_level, 0, nblocks - 1, kb.s, kb.e + dk, jb.s, jb.e + dj,
       KOKKOS_LAMBDA(team_mbr_t member, const int b, const int k, const int j) {
         bool xflux = (k <= kb.e && j <= jb.e);
diff --git a/benchmarks/burgers/parthenon_app_inputs.cpp b/benchmarks/burgers/parthenon_app_inputs.cpp
index 94d24b8e2068..c831d4ef7802 100644
--- a/benchmarks/burgers/parthenon_app_inputs.cpp
+++ b/benchmarks/burgers/parthenon_app_inputs.cpp
@@ -53,7 +53,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   const auto num_vars = q.GetDim(4);
 
   pmb->par_for(
-      "Burgers::ProblemGenerator", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         const Real x = coords.Xc<1>(i);
         const Real y = coords.Xc<2>(j);
diff --git a/example/advection/advection_package.cpp b/example/advection/advection_package.cpp
index 52f8d2c69bda..210b2d474872 100644
--- a/example/advection/advection_package.cpp
+++ b/example/advection/advection_package.cpp
@@ -237,8 +237,7 @@ AmrTag CheckRefinement(MeshBlockData<Real> *rc) {
 
   typename Kokkos::MinMax<Real>::value_type minmax;
   pmb->par_reduce(
-      "advection check refinement", 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-      ib.e,
+      PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i,
                     typename Kokkos::MinMax<Real>::value_type &lminmax) {
         lminmax.min_val =
@@ -276,7 +275,7 @@ void PreFill(MeshBlockData<Real> *rc) {
     const int out = imap.get("one_minus_advected").first;
     const auto num_vars = rc->Get("advected").data.GetDim(4);
     pmb->par_for(
-        "advection_package::PreFill", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
           v(out + n, k, j, i) = 1.0 - v(in + n, k, j, i);
         });
@@ -300,7 +299,7 @@ void SquareIt(MeshBlockData<Real> *rc) {
   const int out = imap.get("one_minus_advected_sq").first;
   const auto num_vars = rc->Get("advected").data.GetDim(4);
   pmb->par_for(
-      "advection_package::SquareIt", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
         v(out + n, k, j, i) = v(in + n, k, j, i) * v(in + n, k, j, i);
       });
@@ -317,8 +316,8 @@ void SquareIt(MeshBlockData<Real> *rc) {
   if (profile == "smooth_gaussian") {
     const auto &advected = rc->Get("advected").data;
     pmb->par_for(
-        "advection_package::SquareIt bval check", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e,
-        ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
+        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
           PARTHENON_REQUIRE(advected(n, k, j, i) != 0.0,
                             "Advected not properly initialized.");
         });
@@ -353,8 +352,8 @@ void PostFill(MeshBlockData<Real> *rc) {
     const int out37 = imap.get("one_minus_sqrt_one_minus_advected_sq_37").first;
     const auto num_vars = rc->Get("advected").data.GetDim(4);
     pmb->par_for(
-        "advection_package::PostFill", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-        ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
+        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
           v(out12 + n, k, j, i) = 1.0 - sqrt(v(in + n, k, j, i));
           v(out37 + n, k, j, i) = 1.0 - v(out12 + n, k, j, i);
         });
@@ -387,7 +386,8 @@ Real AdvectionHst(MeshData<Real> *md) {
   const bool volume_weighting = std::is_same<T, Kokkos::Sum<Real, HostExecSpace>>::value;
 
   pmb->par_reduce(
-      "AdvectionHst", 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
+      ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lresult) {
         const auto &coords = advected_pack.GetCoords(b);
         // `join` is a function of the Kokkos::ReducerConecpt that allows to use the same
@@ -418,7 +418,7 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
   // this is obviously overkill for this constant velocity problem
   Real min_dt;
   pmb->par_reduce(
-      "advection_package::EstimateTimestep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) {
         if (vx != 0.0)
           lmin_dt = std::min(lmin_dt, coords.Dxc<X1DIR>(k, j, i) / std::abs(vx));
@@ -465,8 +465,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   size_t scratch_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nvar, nx1);
   // get x-fluxes
   pmb->par_for_outer(
-      "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e,
-      KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
+      PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+      jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
         parthenon::ScratchPad2D<Real> ql(member.team_scratch(scratch_level), nvar, nx1);
         parthenon::ScratchPad2D<Real> qr(member.team_scratch(scratch_level), nvar, nx1);
         // get reconstructed state on faces
@@ -498,8 +498,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   // get y-fluxes
   if (pmb->pmy_mesh->ndim >= 2) {
     pmb->par_for_outer(
-        "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1,
-        KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
+        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+        jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clear inefficient and kept
           // just for demonstrating purposes. The key point is that we cannot reuse
           // reconstructed arrays for different `j` with `j` being part of the outer
@@ -541,7 +541,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   // get z-fluxes
   if (pmb->pmy_mesh->ndim == 3) {
     pmb->par_for_outer(
-        "x3 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, jb.s, jb.e,
+        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1,
+        jb.s, jb.e,
         KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clear inefficient and kept
           // just for demonstrating purposes. The key point is that we cannot reuse
diff --git a/example/advection/parthenon_app_inputs.cpp b/example/advection/parthenon_app_inputs.cpp
index d55e0243b9ff..43bb74292f06 100644
--- a/example/advection/parthenon_app_inputs.cpp
+++ b/example/advection/parthenon_app_inputs.cpp
@@ -73,7 +73,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   if (profile == "block") profile_type = 3;
 
   pmb->par_for(
-      "Advection::ProblemGenerator", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
         if (profile_type == 0) {
           Real x = cos_a2 * (coords.Xc<1>(i) * cos_a3 + coords.Xc<2>(j) * sin_a3) +
@@ -99,8 +99,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   // initialize some arbitrary cells in the first block that move in all 6 directions
   if (profile_type == 3 && block_id == 0) {
     pmb->par_for(
-        "Advection::ProblemGenerator bvals test", 0, 1,
-        KOKKOS_LAMBDA(const int /*unused*/) {
+        PARTHENON_AUTO_LABEL, 0, 1, KOKKOS_LAMBDA(const int /*unused*/) {
           q(idx_adv, 4, 4, 4) = 10.0;
           q(idx_v, 4, 4, 4) = vx;
           q(idx_adv, 4, 6, 4) = 10.0;
diff --git a/example/calculate_pi/calculate_pi.cpp b/example/calculate_pi/calculate_pi.cpp
index 03ef92fd6872..06d6ba3c1984 100644
--- a/example/calculate_pi/calculate_pi.cpp
+++ b/example/calculate_pi/calculate_pi.cpp
@@ -88,7 +88,7 @@ void SetInOrOut(MeshBlockData<Real> *rc) {
   // Loop bounds are set to catch the case where the edge is between the
   // cell centers of the first/last real cell and the first ghost cell
   pmb->par_for(
-      "SetInOrOut", kb.s, kb.e, jb.s - 1, jb.e + 1, ib.s - 1, ib.e + 1,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s - 1, jb.e + 1, ib.s - 1, ib.e + 1,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         Real rsq = std::pow(coords.Xc<1>(i), 2) + std::pow(coords.Xc<2>(j), 2);
         if (rsq < radius * radius) {
diff --git a/example/kokkos_pi/kokkos_pi.cpp b/example/kokkos_pi/kokkos_pi.cpp
index 4bcdbd8d3819..6d5f69c980bc 100644
--- a/example/kokkos_pi/kokkos_pi.cpp
+++ b/example/kokkos_pi/kokkos_pi.cpp
@@ -272,7 +272,7 @@ result_t naiveParFor(int n_block, int n_mesh, int n_iter, double radius) {
       auto inOrOut = base->PackVariables({Metadata::Independent});
       // iops = 0  fops = 11
       par_for(
-          DEFAULT_LOOP_PATTERN, "par_for in or out", DevExecSpace(), 0,
+          DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0,
           inOrOut.GetDim(4) - 1, nghost, inOrOut.GetDim(3) - nghost - 1, nghost,
           inOrOut.GetDim(2) - nghost - 1, nghost, inOrOut.GetDim(1) - nghost - 1,
           KOKKOS_LAMBDA(const int l, const int k_grid, const int j_grid,
diff --git a/example/particle_leapfrog/particle_leapfrog.cpp b/example/particle_leapfrog/particle_leapfrog.cpp
index b54bd6ddebd3..673e795fdead 100644
--- a/example/particle_leapfrog/particle_leapfrog.cpp
+++ b/example/particle_leapfrog/particle_leapfrog.cpp
@@ -189,7 +189,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   // This hardcoded implementation should only used in PGEN and not during runtime
   // addition of particles as indices need to be taken into account.
   pmb->par_for(
-      "CreateParticles", 0, num_particles_this_block - 1, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, num_particles_this_block - 1, KOKKOS_LAMBDA(const int n) {
         const auto &m = ids_this_block(n);
 
         id(n) = m; // global unique id
@@ -227,7 +227,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator
   const Real ay = 0.0;
   const Real az = 0.0;
   pmb->par_for(
-      "Leapfrog", 0, max_active_index, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
         if (swarm_d.IsActive(n)) {
           // drift
           x(n) += v(0, n) * 0.5 * dt;
diff --git a/example/particle_tracers/particle_tracers.cpp b/example/particle_tracers/particle_tracers.cpp
index 1f8394a6c6c4..9cc8a0cb2570 100644
--- a/example/particle_tracers/particle_tracers.cpp
+++ b/example/particle_tracers/particle_tracers.cpp
@@ -182,7 +182,7 @@ TaskStatus AdvectTracers(MeshBlock *pmb, const StagedIntegrator *integrator) {
 
   auto swarm_d = swarm->GetDeviceContext();
   pmb->par_for(
-      "Tracer advection", 0, max_active_index, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
         if (swarm_d.IsActive(n)) {
           x(n) += vx * dt;
           y(n) += vy * dt;
@@ -219,13 +219,13 @@ TaskStatus DepositTracers(MeshBlock *pmb) {
   auto &tracer_dep = pmb->meshblock_data.Get()->Get("tracer_deposition").data;
   // Reset particle count
   pmb->par_for(
-      "ZeroParticleDep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) { tracer_dep(k, j, i) = 0.; });
 
   const int ndim = pmb->pmy_mesh->ndim;
 
   pmb->par_for(
-      "DepositTracers", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
         if (swarm_d.IsActive(n)) {
           int i = static_cast<int>(std::floor((x(n) - minx_i) / dx_i) + ib.s);
           int j = 0;
@@ -269,7 +269,7 @@ TaskStatus CalculateFluxes(MeshBlockData<Real> *mbd) {
 
   // Spatially first order upwind method
   pmb->par_for(
-      "CalculateFluxesX1", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         // X1
         if (vx > 0.) {
@@ -282,7 +282,7 @@ TaskStatus CalculateFluxes(MeshBlockData<Real> *mbd) {
   if (ndim > 1) {
     auto x2flux = mbd->Get("advected").flux[X2DIR].Get<4>();
     pmb->par_for(
-        "CalculateFluxesX2", kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e,
+        PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e,
         KOKKOS_LAMBDA(const int k, const int j, const int i) {
           // X2
           if (vy > 0.) {
@@ -296,7 +296,7 @@ TaskStatus CalculateFluxes(MeshBlockData<Real> *mbd) {
   if (ndim > 2) {
     auto x3flux = mbd->Get("advected").flux[X3DIR].Get<4>();
     pmb->par_for(
-        "CalculateFluxesX3", kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e,
+        PARTHENON_AUTO_LABEL, kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int k, const int j, const int i) {
           // X3
           if (vz > 0.) {
@@ -355,7 +355,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   const Real kwave = 2. * M_PI / (x_max_mesh - x_min_mesh);
 
   pmb->par_for(
-      "Init advected profile", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         advected(k, j, i) = advected_mean + advected_amp * sin(kwave * coords.Xc<1>(i));
       });
@@ -387,7 +387,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   // This hardcoded implementation should only used in PGEN and not during runtime
   // addition of particles as indices need to be taken into account.
   pmb->par_for(
-      "CreateParticles", 0, num_tracers_meshblock - 1, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, num_tracers_meshblock - 1, KOKKOS_LAMBDA(const int n) {
         auto rng_gen = rng_pool.get_state();
 
         // Rejection sample the x position
diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp
index 5a112977cc23..384a42ddc432 100644
--- a/example/particles/particles.cpp
+++ b/example/particles/particles.cpp
@@ -143,7 +143,7 @@ TaskStatus DestroySomeParticles(MeshBlock *pmb) {
 
   // Randomly mark some fraction of particles each timestep for removal
   pmb->par_for(
-      "DestroySomeParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
         if (swarm_d.IsActive(n)) {
           auto rng_gen = rng_pool.get_state();
           if (rng_gen.drand() > 1.0 - destroy_particles_frac) {
@@ -201,13 +201,13 @@ TaskStatus DepositParticles(MeshBlock *pmb) {
   if (deposition_method == DepositionMethod::per_particle) {
     // Reset particle count
     pmb->par_for(
-        "ZeroParticleDep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int k, const int j, const int i) {
           particle_dep(k, j, i) = 0.;
         });
 
     pmb->par_for(
-        "DepositParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
+        PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
           if (swarm_d.IsActive(n)) {
             int i = static_cast<int>(std::floor((x(n) - minx_i) / dx_i) + ib.s);
             int j = 0;
@@ -227,7 +227,7 @@ TaskStatus DepositParticles(MeshBlock *pmb) {
         });
   } else if (deposition_method == DepositionMethod::per_cell) {
     pmb->par_for(
-        "DepositParticlesByCell", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int k, const int j, const int i) {
           particle_dep(k, j, i) = 0.;
           for (int n = 0; n < swarm_d.GetParticleCountPerCell(k, j, i); n++) {
@@ -278,8 +278,7 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
 
   if (orbiting_particles) {
     pmb->par_for(
-        "CreateSomeOrbitingParticles", 0, swarm->GetMaxActiveIndex(),
-        KOKKOS_LAMBDA(const int n) {
+        PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
           if (new_particles_mask(n)) {
             auto rng_gen = rng_pool.get_state();
 
@@ -323,7 +322,7 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
         });
   } else {
     pmb->par_for(
-        "CreateSomeParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
+        PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
           if (new_particles_mask(n)) {
             auto rng_gen = rng_pool.get_state();
 
@@ -393,7 +392,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator
   if (orbiting_particles) {
     // Particles orbit the origin
     pmb->par_for(
-        "TransportOrbitingParticles", 0, max_active_index, KOKKOS_LAMBDA(const int n) {
+        PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
           if (swarm_d.IsActive(n)) {
             Real vel = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n));
             PARTHENON_DEBUG_REQUIRE(vel > 0., "Speed must be > 0!");
@@ -446,7 +445,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator
   } else {
     // Particles move in straight lines
     pmb->par_for(
-        "TransportParticles", 0, max_active_index, KOKKOS_LAMBDA(const int n) {
+        PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
           if (swarm_d.IsActive(n)) {
             Real vel = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n));
             PARTHENON_DEBUG_REQUIRE(vel > 0., "vel must be > 0 for division!");
diff --git a/example/poisson/parthenon_app_inputs.cpp b/example/poisson/parthenon_app_inputs.cpp
index 4c2bd177c42f..2dd29551f350 100644
--- a/example/poisson/parthenon_app_inputs.cpp
+++ b/example/poisson/parthenon_app_inputs.cpp
@@ -50,8 +50,8 @@ void ProblemGenerator(Mesh *pm, ParameterInput *pin, MeshData<Real> *md) {
   const int iphi = imap["potential"].first;
 
   pmb->par_for(
-      "Poisson::ProblemGenerator", 0, q_bpack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-      ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
+      PARTHENON_AUTO_LABEL, 0, q_bpack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
         const auto &coords = q_bpack.GetCoords(b);
         auto &q = q_bpack(b);
         Real dist2 = std::pow(coords.Xc<1>(i) - x0, 2) +
diff --git a/example/poisson/poisson_package.cpp b/example/poisson/poisson_package.cpp
index 378bc48a3176..032d0decffd5 100644
--- a/example/poisson/poisson_package.cpp
+++ b/example/poisson/poisson_package.cpp
@@ -129,8 +129,8 @@ TaskStatus SetMatrixElements(T *u) {
   const int ndim = v.GetNdim();
   const Real w0 = -2.0 * ndim;
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, "SetMatElem", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, kb.e,
-      jb.s, jb.e, ib.s, ib.e,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1,
+      kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
         for (int n = isp_lo; n <= isp_hi; n++) {
           v(b, n, k, j, i) = 1;
@@ -168,8 +168,8 @@ TaskStatus SumMass(T *u, Real *reduce_sum) {
 
   Real total;
   parthenon::par_reduce(
-      parthenon::loop_pattern_mdrange_tag, "SumMass", DevExecSpace(), 0, v.GetDim(5) - 1,
-      kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0,
+      v.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &sum) {
         sum += v(b, irho, k, j, i) * std::pow(dx, ndim);
       },
@@ -194,8 +194,8 @@ TaskStatus SumDeltaPhi(T *du, Real *reduce_sum) {
 
   Real total;
   parthenon::par_reduce(
-      parthenon::loop_pattern_mdrange_tag, "SumMass", DevExecSpace(), 0, dv.GetDim(5) - 1,
-      kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0,
+      dv.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &sum) {
         sum += std::pow(dv(b, iphi, k, j, i), 2);
       },
@@ -245,8 +245,8 @@ TaskStatus UpdatePhi(T *u, T *du) {
   if (isp_hi < 0) { // there is no sparse matrix, so we must be using the stencil
     const auto &stencil = pkg->Param<Stencil_t>("stencil");
     parthenon::par_for(
-        DEFAULT_LOOP_PATTERN, "StencilJacobi", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s,
-        kb.e, jb.s, jb.e, ib.s, ib.e,
+        DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1,
+        kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
           const Real rhs = dV * v(b, irho, k, j, i);
           const Real phi_new = stencil.Jacobi(v, iphi, b, k, j, i, rhs);
@@ -257,8 +257,8 @@ TaskStatus UpdatePhi(T *u, T *du) {
     const auto &sp_accessor =
         pkg->Param<parthenon::solvers::SparseMatrixAccessor>("sparse_accessor");
     parthenon::par_for(
-        DEFAULT_LOOP_PATTERN, "SparseUpdate", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s,
-        kb.e, jb.s, jb.e, ib.s, ib.e,
+        DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1,
+        kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
           const Real rhs = dV * v(b, irho, k, j, i);
           const Real phi_new =
@@ -268,8 +268,8 @@ TaskStatus UpdatePhi(T *u, T *du) {
   }
 
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, "UpdatePhi", DevExecSpace(), 0, dv.GetDim(5) - 1, kb.s, kb.e,
-      jb.s, jb.e, ib.s, ib.e,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, dv.GetDim(5) - 1,
+      kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
         v(b, iphi, k, j, i) += dv(b, idphi, k, j, i);
       });
@@ -296,7 +296,7 @@ TaskStatus CheckConvergence(T *u, T *du) {
 
   Real max_err;
   parthenon::par_reduce(
-      parthenon::loop_pattern_mdrange_tag, "CheckConvergence", DevExecSpace(), 0,
+      parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0,
       v.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &eps) {
         Real reps = std::abs(dv(b, idphi, k, j, i) / v(b, iphi, k, j, i));
diff --git a/example/sparse_advection/parthenon_app_inputs.cpp b/example/sparse_advection/parthenon_app_inputs.cpp
index 1cd806accad6..0f9730d7f718 100644
--- a/example/sparse_advection/parthenon_app_inputs.cpp
+++ b/example/sparse_advection/parthenon_app_inputs.cpp
@@ -97,8 +97,8 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
       }
 
       pmb->par_for(
-          "SparseAdvection::ProblemGenerator", 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e,
-          ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
+          PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+          KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
             auto x = coords.Xc<1>(i) - x0;
             auto y = coords.Xc<2>(j) - y0;
             auto z = coords.Xc<3>(k);
diff --git a/example/sparse_advection/sparse_advection_package.cpp b/example/sparse_advection/sparse_advection_package.cpp
index b3f8130cbe20..0ffeea6ca96d 100644
--- a/example/sparse_advection/sparse_advection_package.cpp
+++ b/example/sparse_advection/sparse_advection_package.cpp
@@ -203,8 +203,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   size_t scratch_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nvar, nx1);
   // get x-fluxes
   pmb->par_for_outer(
-      "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e,
-      KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
+      PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+      jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
         parthenon::ScratchPad2D<Real> ql(member.team_scratch(scratch_level), nvar, nx1);
         parthenon::ScratchPad2D<Real> qr(member.team_scratch(scratch_level), nvar, nx1);
 
@@ -226,8 +226,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   // get y-fluxes
   if (pmb->pmy_mesh->ndim >= 2) {
     pmb->par_for_outer(
-        "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1,
-        KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
+        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+        jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clearly inefficient and kept
           // just for demonstrating purposes. The key point is that we cannot reuse
           // reconstructed arrays for different `j` with `j` being part of the outer
diff --git a/example/stochastic_subgrid/stochastic_subgrid_package.cpp b/example/stochastic_subgrid/stochastic_subgrid_package.cpp
index 79be429a3232..f91bf0ad7b75 100644
--- a/example/stochastic_subgrid/stochastic_subgrid_package.cpp
+++ b/example/stochastic_subgrid/stochastic_subgrid_package.cpp
@@ -262,9 +262,9 @@ TaskStatus ComputeNumIter(std::shared_ptr<MeshData<Real>> &md, Packages_t &packa
   int N_min = pkg->Param<int>("N_min");
 
   par_for(
-      parthenon::loop_pattern_mdrange_tag, "ComputeNumIter", parthenon::DevExecSpace(), 0,
-      pack.GetDim(5) - 1, 0, pack.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-      KOKKOS_LAMBDA(int b, int v, int k, int j, int i) {
+      parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL,
+      parthenon::DevExecSpace(), 0, pack.GetDim(5) - 1, 0, pack.GetDim(4) - 1, kb.s, kb.e,
+      jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(int b, int v, int k, int j, int i) {
         auto rng = pool.get_state();
         double rand1 = rng.drand();
         double rand2 = rng.drand();
@@ -302,7 +302,7 @@ void DoLotsOfWork(MeshBlockData<Real> *rc) {
   const Real ilog10 = 1.0 / log(10.0);
 
   pmb->par_for(
-      "stochastic_subgrid_package::DoLotsOfWork", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         int num_iter = v(niter, k, j, i);
 
@@ -343,7 +343,7 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
   // this is obviously overkill for this constant velocity problem
   Real min_dt;
   pmb->par_reduce(
-      "stochastic_subgrid_package::EstimateTimestep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) {
         if (vx != 0.0)
           lmin_dt = std::min(lmin_dt, coords.Dxc<X1DIR>(k, j, i) / std::abs(vx));
@@ -380,8 +380,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   parthenon::ParArray4D<Real> x1flux = rc->Get("advected").flux[X1DIR].Get<4>();
   // get x-fluxes
   pmb->par_for_outer(
-      "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e,
-      KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
+      PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+      jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
         parthenon::ScratchPad2D<Real> ql(member.team_scratch(scratch_level), nvar, nx1);
         parthenon::ScratchPad2D<Real> qr(member.team_scratch(scratch_level), nvar, nx1);
         // get reconstructed state on faces
@@ -404,8 +404,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   if (pmb->pmy_mesh->ndim >= 2) {
     parthenon::ParArray4D<Real> x2flux = rc->Get("advected").flux[X2DIR].Get<4>();
     pmb->par_for_outer(
-        "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1,
-        KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
+        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+        jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clear inefficient and kept
           // just for demonstrating purposes. The key point is that we cannot reuse
           // reconstructed arrays for different `j` with `j` being part of the outer
@@ -436,7 +436,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   if (pmb->pmy_mesh->ndim == 3) {
     parthenon::ParArray4D<Real> x3flux = rc->Get("advected").flux[X3DIR].Get<4>();
     pmb->par_for_outer(
-        "x3 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, jb.s, jb.e,
+        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1,
+        jb.s, jb.e,
         KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clear inefficient and kept
           // just for demonstrating purposes. The key point is that we cannot reuse
diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp
index 4f26a9f52809..7c61881308b6 100644
--- a/src/bvals/comms/boundary_communication.cpp
+++ b/src/bvals/comms/boundary_communication.cpp
@@ -82,7 +82,7 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
   auto &sending_nonzero_flags_h = cache.sending_non_zero_flags_h;
 
   Kokkos::parallel_for(
-      "SendBoundBufs",
+      PARTHENON_AUTO_LABEL,
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
       KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
         const int b = team_member.league_rank();
@@ -231,7 +231,7 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
   // const Real threshold = Globals::sparse_config.allocation_threshold;
   auto &bnd_info = cache.bnd_info;
   Kokkos::parallel_for(
-      "SetBoundaryBuffers",
+      PARTHENON_AUTO_LABEL,
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
       KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
         const int b = team_member.league_rank();
diff --git a/src/bvals/comms/flux_correction.cpp b/src/bvals/comms/flux_correction.cpp
index 50166ec9ca89..ff76bcba0014 100644
--- a/src/bvals/comms/flux_correction.cpp
+++ b/src/bvals/comms/flux_correction.cpp
@@ -67,7 +67,7 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
   auto &bnd_info = cache.bnd_info;
   PARTHENON_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info");
   Kokkos::parallel_for(
-      "SendFluxCorrectionBufs",
+      PARTHENON_AUTO_LABEL,
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
       KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
         auto &binfo = bnd_info(team_member.league_rank());
@@ -163,7 +163,7 @@ TaskStatus SetFluxCorrections(std::shared_ptr<MeshData<Real>> &md) {
 
   auto &bnd_info = cache.bnd_info;
   Kokkos::parallel_for(
-      "SetFluxCorBuffers",
+      PARTHENON_AUTO_LABEL,
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
       KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
         const int b = team_member.league_rank();
diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp
index 8359e6fc9a82..166586003bc4 100644
--- a/src/driver/driver.cpp
+++ b/src/driver/driver.cpp
@@ -76,7 +76,7 @@ DriverStatus EvolutionDriver::Execute() {
   DumpInputParameters();
 
   { // Main t < tmax loop region
-    PARTHENON_INSTRUMENT_REGION("evolution_loop")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     while (tm.KeepGoing()) {
       if (Globals::my_rank == 0) OutputCycleDiagnostics();
 
diff --git a/src/interface/swarm.cpp b/src/interface/swarm.cpp
index c9db0ced5478..981ded597589 100644
--- a/src/interface/swarm.cpp
+++ b/src/interface/swarm.cpp
@@ -406,7 +406,7 @@ void Swarm::Defrag() {
 
   auto &mask = mask_;
   pmb->par_for(
-      "Swarm::DefragMask", 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
         if (from_to_indices(n) >= 0) {
           mask(from_to_indices(n)) = mask(n);
           mask(n) = false;
@@ -427,7 +427,7 @@ void Swarm::Defrag() {
   const int intPackDim = vint.GetDim(2);
 
   pmb->par_for(
-      "Swarm::DefragVariables", 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
         if (from_to_indices(n) >= 0) {
           for (int vidx = 0; vidx < realPackDim; vidx++) {
             vreal(vidx, from_to_indices(n)) = vreal(vidx, n);
@@ -477,7 +477,7 @@ void Swarm::SortParticlesByCell() {
 
   // Write an unsorted list
   pmb->par_for(
-      "Write unsorted list", 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
         int i, j, k;
         swarm_d.Xtoijk(x(n), y(n), z(n), i, j, k);
         const int64_t cell_idx_1d = i + nx1 * (j + nx2 * k);
@@ -491,7 +491,7 @@ void Swarm::SortParticlesByCell() {
   const IndexRange &jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire);
   const IndexRange &kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire);
   pmb->par_for(
-      "Update per-cell arrays", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         int cell_idx_1d = i + nx1 * (j + nx2 * k);
         // Find starting index, first by guessing
@@ -970,7 +970,7 @@ void Swarm::LoadBuffers_(const int max_indices_size) {
   auto particle_indices_to_send = particle_indices_to_send_;
   auto neighbor_buffer_index = neighbor_buffer_index_;
   pmb->par_for(
-      "Pack Buffers", 0, max_indices_size - 1,
+      PARTHENON_AUTO_LABEL, 0, max_indices_size - 1,
       KOKKOS_LAMBDA(const int n) {            // Max index
         for (int m = 0; m < nneighbor; m++) { // Number of neighbors
           const int bufid = neighbor_buffer_index(m);
@@ -1109,7 +1109,8 @@ void Swarm::UnloadBuffers_() {
     auto swarm_d = GetDeviceContext();
 
     pmb->par_for(
-        "Unload buffers", 0, total_received_particles_ - 1, KOKKOS_LAMBDA(const int n) {
+        PARTHENON_AUTO_LABEL, 0, total_received_particles_ - 1,
+        KOKKOS_LAMBDA(const int n) {
           const int sid = new_indices(n);
           const int nid = neighbor_index(n);
           int bid = buffer_index(n) * particle_size;
@@ -1137,7 +1138,7 @@ void Swarm::ApplyBoundaries_(const int nparticles, ParArrayND<int> indices) {
   auto bcs = this->bounds_d;
 
   pmb->par_for(
-      "Swarm::ApplyBoundaries", 0, nparticles - 1, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, nparticles - 1, KOKKOS_LAMBDA(const int n) {
         const int sid = indices(n);
         for (int l = 0; l < 6; l++) {
           bcs.bounds[l]->Apply(sid, x(sid), y(sid), z(sid), swarm_d);
diff --git a/src/interface/update.cpp b/src/interface/update.cpp
index 02c3b632926e..b9a7e6c8f778 100644
--- a/src/interface/update.cpp
+++ b/src/interface/update.cpp
@@ -48,7 +48,7 @@ TaskStatus FluxDivergence(MeshBlockData<Real> *in, MeshBlockData<Real> *dudt_con
   const auto &coords = pmb->coords;
   const int ndim = pmb->pmy_mesh->ndim;
   pmb->par_for(
-      "FluxDivergenceBlock", 0, vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, 0, vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) {
         if (dudt.IsAllocated(l) && vin.IsAllocated(l)) {
           dudt(l, k, j, i) = FluxDivHelper(l, k, j, i, ndim, coords, vin);
@@ -71,7 +71,7 @@ TaskStatus FluxDivergence(MeshData<Real> *in_obj, MeshData<Real> *dudt_obj) {
 
   const int ndim = vin.GetNdim();
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, "FluxDivergenceMesh", DevExecSpace(), 0, vin.GetDim(5) - 1, 0,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, vin.GetDim(5) - 1, 0,
       vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int m, const int l, const int k, const int j, const int i) {
         if (dudt.IsAllocated(m, l) && vin.IsAllocated(m, l)) {
@@ -100,8 +100,8 @@ TaskStatus UpdateWithFluxDivergence(MeshBlockData<Real> *u0_data,
   const auto &coords = pmb->coords;
   const int ndim = pmb->pmy_mesh->ndim;
   pmb->par_for(
-      "UpdateWithFluxDivergenceBlock", 0, u0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-      ib.e, KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) {
+      PARTHENON_AUTO_LABEL, 0, u0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) {
         if (u0.IsAllocated(l) && u1.IsAllocated(l)) {
           u0(l, k, j, i) = gam0 * u0(l, k, j, i) + gam1 * u1(l, k, j, i) +
                            beta_dt * FluxDivHelper(l, k, j, i, ndim, coords, u0);
@@ -126,7 +126,7 @@ TaskStatus UpdateWithFluxDivergence(MeshData<Real> *u0_data, MeshData<Real> *u1_
 
   const int ndim = u0_pack.GetNdim();
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, "UpdateWithFluxDivergenceMesh", DevExecSpace(), 0,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0,
       u0_pack.GetDim(5) - 1, 0, u0_pack.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int m, const int l, const int k, const int j, const int i) {
         if (u0_pack.IsAllocated(m, l) && u1_pack.IsAllocated(m, l)) {
@@ -162,7 +162,7 @@ TaskStatus SparseDealloc(MeshData<Real> *md) {
   const int NjNi = Nj * Ni;
   const int NkNjNi = Nk * NjNi;
   Kokkos::parallel_for(
-      "SparseDealloc",
+      PARTHENON_AUTO_LABEL,
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), pack.GetNBlocks(), Kokkos::AUTO),
       KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
         const int b = team_member.league_rank();
diff --git a/src/interface/update.hpp b/src/interface/update.hpp
index d2cc37343802..dd4b8158d0b8 100644
--- a/src/interface/update.hpp
+++ b/src/interface/update.hpp
@@ -75,7 +75,7 @@ TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const
   const auto &y = in2->PackVariables(flags);
   const auto &z = out->PackVariables(flags);
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, "WeightedSumData", DevExecSpace(), 0, x.GetDim(5) - 1, 0,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, x.GetDim(5) - 1, 0,
       x.GetDim(4) - 1, 0, x.GetDim(3) - 1, 0, x.GetDim(2) - 1, 0, x.GetDim(1) - 1,
       KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) {
         // TOOD(someone) This is potentially dangerous and/or not intended behavior
@@ -98,7 +98,7 @@ TaskStatus SetDataToConstant(const F &flags, T *data, const Real val) {
   PARTHENON_INSTRUMENT
   const auto &x = data->PackVariables(flags);
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, "SetDataToConstant", DevExecSpace(), 0, x.GetDim(5) - 1, 0,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, x.GetDim(5) - 1, 0,
       x.GetDim(4) - 1, 0, x.GetDim(3) - 1, 0, x.GetDim(2) - 1, 0, x.GetDim(1) - 1,
       KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) {
         if (x.IsAllocated(b, l)) {
@@ -161,7 +161,7 @@ TaskStatus Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data,
   Real gam0 = pint->gam0[stage - 1];
   Real gam1 = pint->gam1[stage - 1];
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, "2S_Update", DevExecSpace(), 0, s0.GetDim(5) - 1, 0,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, s0.GetDim(5) - 1, 0,
       s0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) {
         if (s0.IsAllocated(b, l) && s1.IsAllocated(b, l) && rhs.IsAllocated(b, l)) {
@@ -199,7 +199,7 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr<T> base_data,
   const IndexRange jb = out_data->GetBoundsJ(interior);
   const IndexRange kb = out_data->GetBoundsK(interior);
   parthenon::par_for(
-      DEFAULT_LOOP_PATTERN, "ButcherSumInit", DevExecSpace(), 0, out.GetDim(5) - 1, 0,
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1, 0,
       out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) {
         if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) {
@@ -210,8 +210,8 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr<T> base_data,
     Real a = pint->a[stage - 1][prev];
     const auto &in = stage_data[stage]->PackVariables(flags);
     parthenon::par_for(
-        DEFAULT_LOOP_PATTERN, "ButcherSum", DevExecSpace(), 0, out.GetDim(5) - 1, 0,
-        out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1,
+        0, out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) {
           if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) {
             out(b, l, k, j, i) += dt * a * in(b, l, k, j, i);
@@ -247,8 +247,8 @@ TaskStatus UpdateButcher(const F &flags, std::vector<std::shared_ptr<T>> stage_d
     const Real butcher_b = pint->b[stage];
     const auto &in = stage_data[stage]->PackVariables(flags);
     parthenon::par_for(
-        DEFAULT_LOOP_PATTERN, "ButcherUpdate", DevExecSpace(), 0, out.GetDim(5) - 1, 0,
-        out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1,
+        0, out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) {
           if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) {
             out(b, l, k, j, i) += dt * b * in(b, l, k, j, i);
@@ -292,19 +292,19 @@ TaskStatus FillDerived(T *rc) {
   PARTHENON_INSTRUMENT
   auto pm = rc->GetParentPointer();
   { // PreFillDerived region
-    PARTHENON_INSTRUMENT_REGION("PreFillDerived")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     for (const auto &pkg : pm->packages.AllPackages()) {
       pkg.second->PreFillDerived(rc);
     }
   } // PreFillDerived region
   { // FillDerived region
-    PARTHENON_INSTRUMENT_REGION("FillDerived");
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     for (const auto &pkg : pm->packages.AllPackages()) {
       pkg.second->FillDerived(rc);
     }
   } // FillDerived region
   { // PostFillDerived region
-    PARTHENON_INSTRUMENT_REGION("PostFillDerived");
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     for (const auto &pkg : pm->packages.AllPackages()) {
       pkg.second->PostFillDerived(rc);
     }
@@ -334,7 +334,7 @@ TaskStatus InitNewlyAllocatedVars(T *rc) {
     auto v = desc.GetPack(rc);
 
     Kokkos::parallel_for(
-        "Set newly allocated interior to default",
+        PARTHENON_AUTO_LABEL,
         Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), v.GetNBlocks(), Kokkos::AUTO),
         KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
           const int b = team_member.league_rank();
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 4018d8220057..f1eff2959c40 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -126,8 +126,8 @@ bool TryRecvCoarseToFine(int lid_recv, int send_rank, const LogicalLocation &fin
         const int is = (ox1 == 0) ? 0 : (ib_int.e - ib_int.s + 1) / 2;
         const int idx_te = static_cast<int>(te) % 3;
         parthenon::par_for(
-            DEFAULT_LOOP_PATTERN, "ReceiveCoarseToFineAMR", DevExecSpace(), 0, nt, 0, nu,
-            0, nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nt, 0, nu, 0,
+            nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
             KOKKOS_LAMBDA(const int t, const int u, const int v, const int k, const int j,
                           const int i) {
               cb(idx_te, t, u, v, k, j, i) = fb(idx_te, t, u, v, k + ks, j + js, i + is);
@@ -219,8 +219,8 @@ bool TryRecvFineToCoarse(int lid_recv, int send_rank, const LogicalLocation &fin
         const int is = (ox1 == 0) ? 0 : (ib.e - ib.s + 1 - TopologicalOffsetI(te));
         const int idx_te = static_cast<int>(te) % 3;
         parthenon::par_for(
-            DEFAULT_LOOP_PATTERN, "ReceiveFineToCoarseAMR", DevExecSpace(), 0, nt, 0, nu,
-            0, nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nt, 0, nu, 0,
+            nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
             KOKKOS_LAMBDA(const int t, const int u, const int v, const int k, const int j,
                           const int i) {
               fb(idx_te, t, u, v, k + ks, j + js, i + is) = cb(idx_te, t, u, v, k, j, i);
@@ -682,7 +682,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   int onbe = onbs + nblist[Globals::my_rank] - 1;
 
   { // Construct new list region
-    PARTHENON_INSTRUMENT_REGION("Construct_new_list")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal);
 
     // create a list mapping the previous gid to the current one
@@ -759,7 +759,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   // Send data from old to new blocks
   std::vector<MPI_Request> send_reqs;
   { // AMR Send region
-    PARTHENON_INSTRUMENT_REGION("AMR_Send")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     for (int n = onbs; n <= onbe; n++) {
       int nn = oldtonew[n];
       LogicalLocation &oloc = loclist[n];
@@ -791,7 +791,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   // Construct a new MeshBlock list (moving the data within the MPI rank)
   BlockList_t new_block_list(nbe - nbs + 1);
   { // AMR Construct new MeshBlockList region
-    PARTHENON_INSTRUMENT_REGION("AMR_Construct_new_list")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     RegionSize block_size = GetBlockSize();
 
     for (int n = nbs; n <= nbe; n++) {
@@ -831,7 +831,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
 
   // Receive the data and load into MeshBlocks
   { // AMR Recv and unpack data
-    PARTHENON_INSTRUMENT_REGION("AMR_Recv_unpack")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     bool all_received;
     int niter = 0;
     if (block_list.size() > 0) {
diff --git a/src/outputs/ascent.cpp b/src/outputs/ascent.cpp
index 57db543f0900..13af3b50f613 100644
--- a/src/outputs/ascent.cpp
+++ b/src/outputs/ascent.cpp
@@ -150,7 +150,7 @@ void AscentOutput::WriteOutputFile(Mesh *pm, ParameterInput *pin, SimTime *tm,
       const int njni = nj * ni;
       auto &ghost_mask = ghost_mask_; // redef to lambda capture class member
       pmb->par_for(
-          "Set ascent ghost mask", 0, ncells - 1, KOKKOS_LAMBDA(const int &idx) {
+          PARTHENON_AUTO_LABEL, 0, ncells - 1, KOKKOS_LAMBDA(const int &idx) {
             const int k = idx / (njni);
             const int j = (idx - k * njni) / ni;
             const int i = idx - k * njni - j * nj;
diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index 866d4096015d..1635627e1056 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -113,9 +113,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   //   WRITING ATTRIBUTES                                                             //
   // -------------------------------------------------------------------------------- //
   H5G info_group;
-  { // write attributes
-    PARTHENON_INSTRUMENT_REGION("write_attributes") {
-      PARTHENON_INSTRUMENT_REGION("write_input")
+  {                                                     // write attributes
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) { // Input section
+      PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
       // write input key-value pairs
       std::ostringstream oss;
       pin->ParameterDump(oss);
@@ -129,7 +129,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
     // we'll need this again at the end
     info_group = MakeGroup(file, "/Info");
     {
-      PARTHENON_INSTRUMENT_REGION("write_info")
+      PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
       HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group);
 
       if (tm != nullptr) {
@@ -190,7 +190,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
     // write Params
     {
-      PARTHENON_INSTRUMENT_REGION("write_params")
+      PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
       const H5G params_group = MakeGroup(file, "/Params");
 
       for (const auto &package : pm->packages.AllPackages()) {
@@ -236,7 +236,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
   // write Blocks metadata
   {
-    PARTHENON_INSTRUMENT_REGION("write_block_metadata")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     const H5G gBlocks = MakeGroup(file, "/Blocks");
 
     // write Xmin[ndim] for blocks
@@ -270,7 +270,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
   // Write mesh coordinates to file
   { // write mesh coords
-    PARTHENON_INSTRUMENT_REGION("write_mesh_coords");
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     for (const bool face : {true, false}) {
       const H5G gLocations = MakeGroup(file, face ? "/Locations" : "/VolumeLocations");
 
@@ -298,7 +298,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   // Write Levels and Logical Locations with the level for each Meshblock loclist contains
   // levels and logical locations for all meshblocks on all ranks
   {
-    PARTHENON_INSTRUMENT_REGION("write_levels_and_locations")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     const auto &loclist = pm->GetLocList();
 
     std::vector<std::int64_t> levels;
@@ -335,7 +335,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   hsize_t num_sparse;
   std::unique_ptr<hbool_t[]> sparse_allocated;
   { // write all variable data
-    PARTHENON_INSTRUMENT_REGION("write_all_variable_data")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
 
     // All blocks have the same list of variable metadata that exist in the entire
     // simulation, but not all variables may be allocated on all blocks
@@ -410,7 +410,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
     // for each variable we write
     for (auto &vinfo : all_vars_info) {
-      PARTHENON_INSTRUMENT_REGION("write_variable_loop")
+      PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
       // not really necessary, but doesn't hurt
       memset(tmpData.data(), 0, tmpData.size() * sizeof(OutT));
 
@@ -480,7 +480,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
       hsize_t index = 0;
 
       { // fill host output buffer
-        PARTHENON_INSTRUMENT_REGION("fill_host_output_buffer")
+        PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
         // for each local mesh block
         for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) {
           const auto &pmb = pm->block_list[b_idx];
@@ -553,7 +553,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
       } // fill host output buffer
 
       { // write variable data
-        PARTHENON_INSTRUMENT_REGION("write_variable_data")
+        PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
         // write data to file
         HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt,
                     p_glob_cnt, pl_xfer, pl_dcreate);
@@ -592,7 +592,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   // write SparseInfo and SparseFields (we can't write a zero-size dataset, so only write
   // this if we have sparse fields)
   if (num_sparse > 0) {
-    PARTHENON_INSTRUMENT_REGION("write_sparse_info")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     local_count[1] = global_count[1] = num_sparse;
 
     HDF5Write2D(file, "SparseInfo", sparse_allocated.get(), p_loc_offset, p_loc_cnt,
@@ -613,7 +613,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
   AllSwarmInfo swarm_info(pm->block_list, output_params.swarms, restart_);
   { // write particle data
-    PARTHENON_INSTRUMENT_REGION("write_particle_data")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     for (auto &[swname, swinfo] : swarm_info.all_info) {
       const H5G g_swm = MakeGroup(file, swname);
       // offsets/counts are NOT the same here vs the grid data
@@ -676,7 +676,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   } // write particle data
 
   { // generate xdmf
-    PARTHENON_INSTRUMENT_REGION("genXDMF")
+    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     // generate XDMF companion file
     XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info);
   } // generate xdmf
diff --git a/src/prolong_restrict/pr_loops.hpp b/src/prolong_restrict/pr_loops.hpp
index 6f9cf823bc37..9dc02e9e92fe 100644
--- a/src/prolong_restrict/pr_loops.hpp
+++ b/src/prolong_restrict/pr_loops.hpp
@@ -123,8 +123,8 @@ ProlongationRestrictionLoop(const ProResInfoArr_t &info, const Idx_t &buffer_idx
   const int scratch_level = 1; // 0 is actual scratch (tiny); 1 is HBM
   size_t scratch_size_in_bytes = 1;
   par_for_outer(
-      DEFAULT_OUTER_LOOP_PATTERN, "ProlongateOrRestrictCellCenteredValues",
-      DevExecSpace(), scratch_size_in_bytes, scratch_level, 0, nbuffers - 1,
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(),
+      scratch_size_in_bytes, scratch_level, 0, nbuffers - 1,
       KOKKOS_LAMBDA(team_mbr_t team_member, const int sub_idx) {
         const std::size_t buf = buffer_idxs(sub_idx);
         if (DoRefinementOp(info(buf), op)) {
@@ -157,8 +157,8 @@ InnerHostProlongationRestrictionLoop(std::size_t buf, const ProResInfoArrHost_t
   auto coarse = info(buf).coarse;
   auto fine = info(buf).fine;
   par_for(
-      DEFAULT_LOOP_PATTERN, "ProlongateOrRestrictCellCenteredValues", DevExecSpace(), 0,
-      0, 0, 0, 0, idxer.size() - 1, KOKKOS_LAMBDA(const int, const int, const int ii) {
+      DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, 0, 0, 0, 0,
+      idxer.size() - 1, KOKKOS_LAMBDA(const int, const int, const int ii) {
         const auto [t, u, v, k, j, i] = idxer(ii);
         if (idxer.IsActive(k, j, i)) {
           Stencil::template Do<DIM, FEL, CEL>(t, u, v, k, j, i, ckb, cjb, cib, kb, jb, ib,
diff --git a/src/utils/buffer_utils.cpp b/src/utils/buffer_utils.cpp
index 65213d9421c7..d2a2e418c83f 100644
--- a/src/utils/buffer_utils.cpp
+++ b/src/utils/buffer_utils.cpp
@@ -44,7 +44,7 @@ void PackData(ParArray4D<T> &src, BufArray1D<T> &buf, int sn, int en, int si, in
   const int nn = en + 1 - sn;
 
   pmb->par_for(
-      "PackData 4D", sn, en, sk, ek, sj, ej, si, ei,
+      PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
         buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn)))) =
             src(n, k, j, i);
@@ -62,7 +62,7 @@ void PackZero(BufArray1D<T> &buf, int sn, int en, int si, int ei, int sj, int ej
   const int nn = en + 1 - sn;
 
   pmb->par_for(
-      "PackZero 4D", sn, en, sk, ek, sj, ej, si, ei,
+      PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
         buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn)))) = 0.0;
       });
@@ -84,7 +84,7 @@ void PackData(ParArray3D<T> &src, BufArray1D<T> &buf, int si, int ei, int sj, in
   const int nk = ek + 1 - sk;
 
   pmb->par_for(
-      "PackData 3D", sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) {
+      PARTHENON_AUTO_LABEL, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) {
         buf(offset + i - si + ni * (j - sj + nj * (k - sk))) = src(k, j, i);
       });
 
@@ -107,7 +107,7 @@ void UnpackData(BufArray1D<T> &buf, ParArray4D<T> &dst, int sn, int en, int si,
   const int nn = en + 1 - sn;
 
   pmb->par_for(
-      "UnpackData 4D", sn, en, sk, ek, sj, ej, si, ei,
+      PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei,
       KOKKOS_LAMBDA(int n, int k, int j, int i) {
         dst(n, k, j, i) =
             buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn))));
@@ -131,7 +131,7 @@ void UnpackData(BufArray1D<T> &buf, ParArray3D<T> &dst, int si, int ei, int sj,
   const int nk = ek + 1 - sk;
 
   pmb->par_for(
-      "UnpackData 3D", sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) {
+      PARTHENON_AUTO_LABEL, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) {
         dst(k, j, i) = buf(offset + i - si + ni * (j - sj + nj * (k - sk)));
       });
 
diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp
index 3fa3fcb4f7c4..6e1030056141 100644
--- a/src/utils/instrument.hpp
+++ b/src/utils/instrument.hpp
@@ -18,24 +18,37 @@
 #include <Kokkos_Core.hpp>
 
 #define PARTHENON_INSTRUMENT KokkosTimer internal_inst_(__FILE__, __LINE__, __func__);
-#define PARTHENON_INSTRUMENT_REGION(name)                                                \
-  KokkosTimer internal_inst_reg____LINE__(__FILE__, __LINE__, name);
+#define PARTHENON_INSTRUMENT_REGION(name) KokkosTimer internal_inst_reg_(name);
+#define PARTHENON_AUTO_LABEL parthenon::build_auto_label(__FILE__, __LINE__, __func__)
 
 namespace parthenon {
+namespace detail {
+inline std::string strip_full_path(const std::string &full_path) {
+  std::string label;
+  auto npos = full_path.rfind('/');
+  if (npos == std::string::npos) {
+    label = full_path;
+  } else {
+    label = full_path.substr(npos + 1);
+  }
+  return label;
+}
+} // namespace detail
+
+inline std::string build_auto_label(const std::string &file, const int line,
+                                    const std::string &name) {
+  return detail::strip_full_path(file) + "::" + std::to_string(line) + "::" + name;
+}
 
 struct KokkosTimer {
   KokkosTimer(const std::string &file, const int line, const std::string &name) {
-    std::string label;
-    auto npos = file.rfind('/');
-    if (npos == std::string::npos) {
-      label = file;
-    } else {
-      label = file.substr(npos + 1);
-    }
-    label += "::" + std::to_string(line) + "::" + name;
-    Kokkos::Profiling::pushRegion(label);
+    Push(build_auto_label(file, line, name));
   }
+  KokkosTimer(const std::string &name) { Push(name); }
   ~KokkosTimer() { Kokkos::Profiling::popRegion(); }
+
+ private:
+  void Push(const std::string &name) { Kokkos::Profiling::pushRegion(name); }
 };
 
 } // namespace parthenon

From 410ceb5117bf504d300c743c041e2bc10d94b135 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Tue, 31 Oct 2023 17:07:21 -0600
Subject: [PATCH 43/67] missed one

---
 src/bvals/boundary_conditions_generic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bvals/boundary_conditions_generic.hpp b/src/bvals/boundary_conditions_generic.hpp
index 9ee8cf5eba8e..76a52b899c88 100644
--- a/src/bvals/boundary_conditions_generic.hpp
+++ b/src/bvals/boundary_conditions_generic.hpp
@@ -88,7 +88,7 @@ void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
   const int offsetin = INNER;
   const int offsetout = !INNER;
   pmb->par_for_bndry(
-      label, nb, domain, el, coarse,
+      PARTHENON_AUTO_LABEL, nb, domain, el, coarse,
       KOKKOS_LAMBDA(const int &l, const int &k, const int &j, const int &i) {
         if (TYPE == BCType::Reflect) {
           const bool reflect = (q(b, el, l).vector_component == DIR);

From 14a10606f2e2bfcd314912c209fbd765bf17f11a Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 09:08:33 -0600
Subject: [PATCH 44/67] fix style

---
 src/mesh/amr_loadbalance.cpp   |  1 -
 src/outputs/parthenon_hdf5.cpp | 22 ++++++++++------------
 src/utils/instrument.hpp       |  4 ++--
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index f1eff2959c40..0814c5ccb487 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -961,7 +961,6 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     for (auto &pmb : block_list) {
       pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
     }
-
   } // AMR Recv and unpack data
 
   ResetLoadBalanceVariables();
diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index 1635627e1056..3e9e5d55c2e7 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -128,7 +128,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
     // we'll need this again at the end
     info_group = MakeGroup(file, "/Info");
-    {
+    { // write info
       PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
       HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group);
 
@@ -186,10 +186,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
       }
 
       HDF5WriteAttribute("BoundaryConditions", boundary_condition_str, info_group);
-    } // Info section
+    } // write info
 
-    // write Params
-    {
+    { // write params
       PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
       const H5G params_group = MakeGroup(file, "/Params");
 
@@ -198,7 +197,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
         // Write all params that can be written as HDF5 attributes
         state->AllParams().WriteAllToHDF5(state->label(), params_group);
       }
-    } // Params section
+    } // write params
   }   // write attributes
 
   // -------------------------------------------------------------------------------- //
@@ -234,8 +233,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   PARTHENON_HDF5_CHECK(H5Pset_dxpl_mpio(pl_xfer, H5FD_MPIO_COLLECTIVE));
 #endif
 
-  // write Blocks metadata
-  {
+  { // write Blocks metadata
     PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     const H5G gBlocks = MakeGroup(file, "/Blocks");
 
@@ -266,7 +264,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
       HDF5Write2D(gBlocks, "loc.level-gid-lid-cnghost-gflag", tmpID.data(), p_loc_offset,
                   p_loc_cnt, p_glob_cnt, pl_xfer);
     }
-  } // Block section
+  } // write Blocks metadata
 
   // Write mesh coordinates to file
   { // write mesh coords
@@ -297,7 +295,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
   // Write Levels and Logical Locations with the level for each Meshblock loclist contains
   // levels and logical locations for all meshblocks on all ranks
-  {
+  { // write levels + log locs
     PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
     const auto &loclist = pm->GetLocList();
 
@@ -325,7 +323,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
     // reset for collective output
     local_count[0] = num_blocks_local;
-  }
+  } // write levels + log locs
 
   // -------------------------------------------------------------------------------- //
   //   WRITING VARIABLES DATA                                                         //
@@ -558,8 +556,8 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
         HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt,
                     p_glob_cnt, pl_xfer, pl_dcreate);
       } // write variable data
-    }
-  } // write all variable data
+    }   // for each variable
+  }     // write all variable data
 
   // names of variables
   std::vector<std::string> var_names;
diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp
index 6e1030056141..1f3b3972d0fd 100644
--- a/src/utils/instrument.hpp
+++ b/src/utils/instrument.hpp
@@ -44,7 +44,7 @@ struct KokkosTimer {
   KokkosTimer(const std::string &file, const int line, const std::string &name) {
     Push(build_auto_label(file, line, name));
   }
-  KokkosTimer(const std::string &name) { Push(name); }
+  explicit KokkosTimer(const std::string &name) { Push(name); }
   ~KokkosTimer() { Kokkos::Profiling::popRegion(); }
 
  private:
@@ -53,4 +53,4 @@ struct KokkosTimer {
 
 } // namespace parthenon
 
-#endif
+#endif // UTILS_INSTRUMENT_HPP_

From e9f755e5ca5ca014a0a9dc72f348fa19401139b8 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 09:33:29 -0600
Subject: [PATCH 45/67] make auto named profiling variables unique

---
 src/utils/instrument.hpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp
index 1f3b3972d0fd..df0a072e13d6 100644
--- a/src/utils/instrument.hpp
+++ b/src/utils/instrument.hpp
@@ -17,9 +17,15 @@
 
 #include <Kokkos_Core.hpp>
 
-#define PARTHENON_INSTRUMENT KokkosTimer internal_inst_(__FILE__, __LINE__, __func__);
-#define PARTHENON_INSTRUMENT_REGION(name) KokkosTimer internal_inst_reg_(name);
+#define __UNIQUE_INST_VAR2(x, y) x##y
+#define __UNIQUE_INST_VAR(x, y) __UNIQUE_INST_VAR2(x, y)
+#define PARTHENON_INSTRUMENT                                                             \
+  KokkosTimer __UNIQUE_INST_VAR(internal_inst, __LINE__)(__FILE__, __LINE__, __func__);
+#define PARTHENON_INSTRUMENT_REGION(name)                                                \
+  KokkosTimer __UNIQUE_INST_VAR(internal_inst_reg, __LINE__)(name);
 #define PARTHENON_AUTO_LABEL parthenon::build_auto_label(__FILE__, __LINE__, __func__)
+//#undef UNIQUE_VAR
+//#undef UNIQUE_VAR2
 
 namespace parthenon {
 namespace detail {

From 3b70bbb08b4b9f6d1a2c38c15fc647ed5a741843 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 12:17:58 -0600
Subject: [PATCH 46/67] missed a few

---
 src/amr_criteria/refinement_package.cpp | 11 +++++++----
 src/interface/swarm.cpp                 |  2 +-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/amr_criteria/refinement_package.cpp b/src/amr_criteria/refinement_package.cpp
index bc6744cab109..57542e9e5775 100644
--- a/src/amr_criteria/refinement_package.cpp
+++ b/src/amr_criteria/refinement_package.cpp
@@ -59,6 +59,7 @@ AmrTag CheckAllRefinement(MeshBlockData<Real> *rc) {
   //    2) the code must maintain proper nesting, which sometimes means a block that is
   //       tagged as "derefine" must be left alone (or possibly refined?) because of
   //       neighboring blocks.  Similarly for "do nothing"
+  PARTHENON_INSTRUMENT
   std::shared_ptr<MeshBlock> pmb = rc->GetBlockPointer();
   // delta_level holds the max over all criteria.  default to derefining.
   AmrTag delta_level = AmrTag::derefine;
@@ -90,11 +91,12 @@ AmrTag CheckAllRefinement(MeshBlockData<Real> *rc) {
 
 AmrTag FirstDerivative(const AMRBounds &bnds, const ParArray3D<Real> &q,
                        const Real refine_criteria, const Real derefine_criteria) {
+  PARTHENON_INSTRUMENT
   const int ndim = 1 + (bnds.je > bnds.js) + (bnds.ke > bnds.ks);
   Real maxd = 0.0;
   par_reduce(
-      loop_pattern_mdrange_tag, "refinement first derivative", DevExecSpace(), bnds.ks,
-      bnds.ke, bnds.js, bnds.je, bnds.is, bnds.ie,
+      loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), bnds.ks, bnds.ke,
+      bnds.js, bnds.je, bnds.is, bnds.ie,
       KOKKOS_LAMBDA(int k, int j, int i, Real &maxd) {
         Real scale = std::abs(q(k, j, i));
         Real d =
@@ -118,11 +120,12 @@ AmrTag FirstDerivative(const AMRBounds &bnds, const ParArray3D<Real> &q,
 
 AmrTag SecondDerivative(const AMRBounds &bnds, const ParArray3D<Real> &q,
                         const Real refine_criteria, const Real derefine_criteria) {
+  PARTHENON_INSTRUMENT
   const int ndim = 1 + (bnds.je > bnds.js) + (bnds.ke > bnds.ks);
   Real maxd = 0.0;
   par_reduce(
-      loop_pattern_mdrange_tag, "refinement second derivative", DevExecSpace(), bnds.ks,
-      bnds.ke, bnds.js, bnds.je, bnds.is, bnds.ie,
+      loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), bnds.ks, bnds.ke,
+      bnds.js, bnds.je, bnds.is, bnds.ie,
       KOKKOS_LAMBDA(int k, int j, int i, Real &maxd) {
         Real aqt = std::abs(q(k, j, i)) + TINY_NUMBER;
         Real qavg = 0.5 * (q(k, j, i + 1) + q(k, j, i - 1));
diff --git a/src/interface/swarm.cpp b/src/interface/swarm.cpp
index 981ded597589..c6106d387c02 100644
--- a/src/interface/swarm.cpp
+++ b/src/interface/swarm.cpp
@@ -1005,7 +1005,7 @@ void Swarm::Send(BoundaryCommSubset phase) {
 
     int total_sent_particles = 0;
     pmb->par_reduce(
-        "total sent particles", 0, max_active_index_,
+        PARTHENON_AUTO_LABEL, 0, max_active_index_,
         KOKKOS_LAMBDA(int n, int &total_sent_particles) {
           if (swarm_d.IsActive(n)) {
             if (!swarm_d.IsOnCurrentMeshBlock(n)) {

From 220a0532a0d167a87e9dd53dbce08754cc7540b6 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 12:56:44 -0600
Subject: [PATCH 47/67] add missing function profiling calls.  add new badly
 written script to post process output from kokkos simple kernel timer.

---
 .../process_timer.py                          | 140 ++++++++++++++++++
 src/prolong_restrict/pr_loops.hpp             |   3 +
 2 files changed, 143 insertions(+)
 create mode 100644 scripts/python/packages/parthenon_process_kernel_timer/process_timer.py

diff --git a/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py
new file mode 100644
index 000000000000..afa31bd03339
--- /dev/null
+++ b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py
@@ -0,0 +1,140 @@
+# =========================================================================================
+# (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
+#
+# This program was produced under U.S. Government contract 89233218CNA000001 for Los
+# Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+# for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+# in the program are reserved by Triad National Security, LLC, and the U.S. Department
+# of Energy/National Nuclear Security Administration. The Government is granted for
+# itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+# license in this material to reproduce, prepare derivative works, distribute copies to
+# the public, perform publicly and display publicly, and to permit others to do so.
+# =========================================================================================
+
+"""
+Process the text profiling data spit out by the kokkos simple kernel timer, arranging things
+in a convenient way, assuming all the region/kernel names are auto-generated by parthenon
+"""
+
+import sys
+
+class region:
+  def __init__(self, fl, line, data):
+    self.file = fl
+    self.line = line
+    self.data = data
+
+class func:
+  def __init__(self, name):
+    self.name = name
+    self.file = ""
+    self.reg = {}
+    self.reg_type = {}
+    self.total_time = 0.0
+    self.total_pct = 0.0
+    self.total_kernel_time = 0.0
+  def add_region(self, fl, line, data):
+    if self.file == "":
+      self.file = fl
+    else:
+      if fl != self.file:
+        print("Duplicate function name found")
+        sys.exit(1)
+    dsplit = data.split()
+    self.reg_type[line] = dsplit[0][1:-1]
+    self.reg[line] = [float(dsplit[1]), float(dsplit[5].rstrip())]
+    if self.reg_type[line] != "REGION":
+      self.total_kernel_time += self.reg[line][0]
+  def compute_total_cost(self):
+    if len(self.reg) == 1:
+      for key in self.reg.keys():
+        if self.reg_type[key] != "REGION":
+          self.name += "   ***WARNING: no function-level profiling***"
+        self.total_time = self.reg[key][0]
+        self.total_pct = self.reg[key][1]
+      return
+    # sort by line number
+    keys = list(self.reg.keys())
+    key_int = [int(k) for k in keys]
+    key_int.sort()
+    keys = [str(k) for k in key_int]
+    if self.reg_type[keys[0]] == "REGION":
+      # assume this is the time for the function
+      self.total_time = self.reg[keys[0]][0]
+      self.total_pct = self.reg[keys[0]][1]
+    else :
+      # assume there are just kernel timers
+      self.name += "   ***WARNING: no function-level profiling***"
+      wall = 0.0
+      for key,val in self.reg.items():
+        if self.reg_type[key] != "REGION":
+          self.total_time += self.reg[key][0]
+          wall = self.reg[key][0]/self.reg[key][1]
+      self.total_pct = self.total_time / wall
+  def print_region(self):
+    print(self.name, "time: " + str(self.total_time), "%wall: " + str(self.total_pct))
+    for key,val in self.reg.items():
+      if val[0] != self.total_time:
+        print("  ", "type: " + self.reg_type[key], "line: " + key, "selftime: " + str(val[0]), "%func: " + str(100*val[0]/self.total_time))
+    if self.total_time > 0:
+      print("  ", "Kernel summary: Total kernel time: " + str(self.total_kernel_time), "  % Time in kernels: ", str(100*self.total_kernel_time/self.total_time))
+    else:
+      print("  ", "Apparently this function took zero time to execute???")
+
+def parse_name(s):
+  if s[0:6] == "Kokkos":
+    f = s.rstrip()
+    line = "na"
+    fl = "na"
+    label = f
+  else:
+    words = s.split("::")
+    if words[0] == "kokkos_abstraction.hpp":
+      return "", "", "", "", True
+    # make sure it follows the filename::line_number::name convection
+    if ".cpp" in s or ".hpp" in s:
+      # now strip away the file and line
+      fl = s[:s.find(":")]
+      f = s[s.find(":")+2:]
+      line = f[:f.find(":")]
+      f = f[f.find(":")+2:]
+      label = (fl + "::" + f).rstrip()
+    else:
+      print("nonconforming entry", s)
+      sys.exit(1)
+  return f, line, fl, label, False
+
+def main(prof):
+  funcs = {}
+  with open(prof) as fp:
+    raw = fp.readlines()
+  if raw[0].rstrip() != "Regions:":
+    print(prof + " does not appear to be a profile from the Kokkos simple kernel timer")
+    print(raw[0])
+    sys.exit(1)
+  cline = 2
+  in_regions = True
+  # process regions/kernels
+  while raw[cline].rstrip() != "":
+    sraw = raw[cline][2::]
+    f, line, fl, label, skip = parse_name(sraw)
+    if skip:
+      cline += 2
+      continue
+    if label not in funcs.keys():
+      funcs[label] = func(label)
+    funcs[label].add_region(fl, line, raw[cline+1].rstrip())
+    cline += 2
+    if raw[cline].rstrip() == "" and in_regions:
+      cline += 4
+      in_regions = False
+
+  for key in funcs.keys():
+    funcs[key].compute_total_cost()
+  
+  for key in (sorted(funcs, key=lambda name: funcs[name].total_time, reverse=True)):
+    funcs[key].print_region()
+    print()
+
+if __name__ == "__main__":
+  main(sys.argv[1])
\ No newline at end of file
diff --git a/src/prolong_restrict/pr_loops.hpp b/src/prolong_restrict/pr_loops.hpp
index 9dc02e9e92fe..2cb07ac4c547 100644
--- a/src/prolong_restrict/pr_loops.hpp
+++ b/src/prolong_restrict/pr_loops.hpp
@@ -64,6 +64,7 @@ KOKKOS_INLINE_FUNCTION void InnerProlongationRestrictionLoop(
     team_mbr_t &team_member, std::size_t buf, const ProResInfoArr_t &info,
     const IndexRange &ckb, const IndexRange &cjb, const IndexRange &cib,
     const IndexRange &kb, const IndexRange &jb, const IndexRange &ib) {
+  PARTHENON_INSTRUMENT
   const auto &idxer = info(buf).idxer[static_cast<int>(CEL)];
   par_for_inner(
       inner_loop_pattern_tvr_tag, team_member, 0, idxer.size() - 1, [&](const int ii) {
@@ -113,6 +114,7 @@ inline void
 ProlongationRestrictionLoop(const ProResInfoArr_t &info, const Idx_t &buffer_idxs,
                             const IndexShape &cellbounds, const IndexShape &c_cellbounds,
                             const RefinementOp_t op, const std::size_t nbuffers) {
+  PARTHENON_INSTRUMENT
   const IndexDomain interior = IndexDomain::interior;
   auto ckb = c_cellbounds.GetBoundsK(interior);
   auto cjb = c_cellbounds.GetBoundsJ(interior);
@@ -151,6 +153,7 @@ InnerHostProlongationRestrictionLoop(std::size_t buf, const ProResInfoArrHost_t
                                      const IndexRange &ckb, const IndexRange &cjb,
                                      const IndexRange &cib, const IndexRange &kb,
                                      const IndexRange &jb, const IndexRange &ib) {
+  PARTHENON_INSTRUMENT
   const auto &idxer = info(buf).idxer[static_cast<int>(CEL)];
   auto coords = info(buf).coords;
   auto coarse_coords = info(buf).coarse_coords;

From 883323c0be50ac77a7cdf091294a3e4c33322021 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 13:43:47 -0600
Subject: [PATCH 48/67] formatting

---
 .../process_timer.py                          | 244 ++++++++++--------
 1 file changed, 134 insertions(+), 110 deletions(-)

diff --git a/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py
index afa31bd03339..e23a40bbf72b 100644
--- a/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py
+++ b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py
@@ -18,123 +18,147 @@
 
 import sys
 
+
 class region:
-  def __init__(self, fl, line, data):
-    self.file = fl
-    self.line = line
-    self.data = data
+    def __init__(self, fl, line, data):
+        self.file = fl
+        self.line = line
+        self.data = data
+
 
 class func:
-  def __init__(self, name):
-    self.name = name
-    self.file = ""
-    self.reg = {}
-    self.reg_type = {}
-    self.total_time = 0.0
-    self.total_pct = 0.0
-    self.total_kernel_time = 0.0
-  def add_region(self, fl, line, data):
-    if self.file == "":
-      self.file = fl
-    else:
-      if fl != self.file:
-        print("Duplicate function name found")
-        sys.exit(1)
-    dsplit = data.split()
-    self.reg_type[line] = dsplit[0][1:-1]
-    self.reg[line] = [float(dsplit[1]), float(dsplit[5].rstrip())]
-    if self.reg_type[line] != "REGION":
-      self.total_kernel_time += self.reg[line][0]
-  def compute_total_cost(self):
-    if len(self.reg) == 1:
-      for key in self.reg.keys():
-        if self.reg_type[key] != "REGION":
-          self.name += "   ***WARNING: no function-level profiling***"
-        self.total_time = self.reg[key][0]
-        self.total_pct = self.reg[key][1]
-      return
-    # sort by line number
-    keys = list(self.reg.keys())
-    key_int = [int(k) for k in keys]
-    key_int.sort()
-    keys = [str(k) for k in key_int]
-    if self.reg_type[keys[0]] == "REGION":
-      # assume this is the time for the function
-      self.total_time = self.reg[keys[0]][0]
-      self.total_pct = self.reg[keys[0]][1]
-    else :
-      # assume there are just kernel timers
-      self.name += "   ***WARNING: no function-level profiling***"
-      wall = 0.0
-      for key,val in self.reg.items():
-        if self.reg_type[key] != "REGION":
-          self.total_time += self.reg[key][0]
-          wall = self.reg[key][0]/self.reg[key][1]
-      self.total_pct = self.total_time / wall
-  def print_region(self):
-    print(self.name, "time: " + str(self.total_time), "%wall: " + str(self.total_pct))
-    for key,val in self.reg.items():
-      if val[0] != self.total_time:
-        print("  ", "type: " + self.reg_type[key], "line: " + key, "selftime: " + str(val[0]), "%func: " + str(100*val[0]/self.total_time))
-    if self.total_time > 0:
-      print("  ", "Kernel summary: Total kernel time: " + str(self.total_kernel_time), "  % Time in kernels: ", str(100*self.total_kernel_time/self.total_time))
-    else:
-      print("  ", "Apparently this function took zero time to execute???")
+    def __init__(self, name):
+        self.name = name
+        self.file = ""
+        self.reg = {}
+        self.reg_type = {}
+        self.total_time = 0.0
+        self.total_pct = 0.0
+        self.total_kernel_time = 0.0
+
+    def add_region(self, fl, line, data):
+        if self.file == "":
+            self.file = fl
+        else:
+            if fl != self.file:
+                print("Duplicate function name found")
+                sys.exit(1)
+        dsplit = data.split()
+        self.reg_type[line] = dsplit[0][1:-1]
+        self.reg[line] = [float(dsplit[1]), float(dsplit[5].rstrip())]
+        if self.reg_type[line] != "REGION":
+            self.total_kernel_time += self.reg[line][0]
+
+    def compute_total_cost(self):
+        if len(self.reg) == 1:
+            for key in self.reg.keys():
+                if self.reg_type[key] != "REGION":
+                    self.name += "   ***WARNING: no function-level profiling***"
+                self.total_time = self.reg[key][0]
+                self.total_pct = self.reg[key][1]
+            return
+        # sort by line number
+        keys = list(self.reg.keys())
+        key_int = [int(k) for k in keys]
+        key_int.sort()
+        keys = [str(k) for k in key_int]
+        if self.reg_type[keys[0]] == "REGION":
+            # assume this is the time for the function
+            self.total_time = self.reg[keys[0]][0]
+            self.total_pct = self.reg[keys[0]][1]
+        else:
+            # assume there are just kernel timers
+            self.name += "   ***WARNING: no function-level profiling***"
+            wall = 0.0
+            for key, val in self.reg.items():
+                if self.reg_type[key] != "REGION":
+                    self.total_time += self.reg[key][0]
+                    wall = self.reg[key][0] / self.reg[key][1]
+            self.total_pct = self.total_time / wall
+
+    def print_region(self):
+        print(
+            self.name, "time: " + str(self.total_time), "%wall: " + str(self.total_pct)
+        )
+        for key, val in self.reg.items():
+            if val[0] != self.total_time:
+                print(
+                    "  ",
+                    "type: " + self.reg_type[key],
+                    "line: " + key,
+                    "selftime: " + str(val[0]),
+                    "%func: " + str(100 * val[0] / self.total_time),
+                )
+        if self.total_time > 0:
+            print(
+                "  ",
+                "Kernel summary: Total kernel time: " + str(self.total_kernel_time),
+                "  % Time in kernels: ",
+                str(100 * self.total_kernel_time / self.total_time),
+            )
+        else:
+            print("  ", "Apparently this function took zero time to execute???")
+
 
 def parse_name(s):
-  if s[0:6] == "Kokkos":
-    f = s.rstrip()
-    line = "na"
-    fl = "na"
-    label = f
-  else:
-    words = s.split("::")
-    if words[0] == "kokkos_abstraction.hpp":
-      return "", "", "", "", True
-    # make sure it follows the filename::line_number::name convection
-    if ".cpp" in s or ".hpp" in s:
-      # now strip away the file and line
-      fl = s[:s.find(":")]
-      f = s[s.find(":")+2:]
-      line = f[:f.find(":")]
-      f = f[f.find(":")+2:]
-      label = (fl + "::" + f).rstrip()
+    if s[0:6] == "Kokkos":
+        f = s.rstrip()
+        line = "na"
+        fl = "na"
+        label = f
     else:
-      print("nonconforming entry", s)
-      sys.exit(1)
-  return f, line, fl, label, False
+        words = s.split("::")
+        if words[0] == "kokkos_abstraction.hpp":
+            return "", "", "", "", True
+        # make sure it follows the filename::line_number::name convection
+        if ".cpp" in s or ".hpp" in s:
+            # now strip away the file and line
+            fl = s[: s.find(":")]
+            f = s[s.find(":") + 2 :]
+            line = f[: f.find(":")]
+            f = f[f.find(":") + 2 :]
+            label = (fl + "::" + f).rstrip()
+        else:
+            print("nonconforming entry", s)
+            sys.exit(1)
+    return f, line, fl, label, False
+
 
 def main(prof):
-  funcs = {}
-  with open(prof) as fp:
-    raw = fp.readlines()
-  if raw[0].rstrip() != "Regions:":
-    print(prof + " does not appear to be a profile from the Kokkos simple kernel timer")
-    print(raw[0])
-    sys.exit(1)
-  cline = 2
-  in_regions = True
-  # process regions/kernels
-  while raw[cline].rstrip() != "":
-    sraw = raw[cline][2::]
-    f, line, fl, label, skip = parse_name(sraw)
-    if skip:
-      cline += 2
-      continue
-    if label not in funcs.keys():
-      funcs[label] = func(label)
-    funcs[label].add_region(fl, line, raw[cline+1].rstrip())
-    cline += 2
-    if raw[cline].rstrip() == "" and in_regions:
-      cline += 4
-      in_regions = False
-
-  for key in funcs.keys():
-    funcs[key].compute_total_cost()
-  
-  for key in (sorted(funcs, key=lambda name: funcs[name].total_time, reverse=True)):
-    funcs[key].print_region()
-    print()
+    funcs = {}
+    with open(prof) as fp:
+        raw = fp.readlines()
+    if raw[0].rstrip() != "Regions:":
+        print(
+            prof
+            + " does not appear to be a profile from the Kokkos simple kernel timer"
+        )
+        print(raw[0])
+        sys.exit(1)
+    cline = 2
+    in_regions = True
+    # process regions/kernels
+    while raw[cline].rstrip() != "":
+        sraw = raw[cline][2::]
+        f, line, fl, label, skip = parse_name(sraw)
+        if skip:
+            cline += 2
+            continue
+        if label not in funcs.keys():
+            funcs[label] = func(label)
+        funcs[label].add_region(fl, line, raw[cline + 1].rstrip())
+        cline += 2
+        if raw[cline].rstrip() == "" and in_regions:
+            cline += 4
+            in_regions = False
+
+    for key in funcs.keys():
+        funcs[key].compute_total_cost()
+
+    for key in sorted(funcs, key=lambda name: funcs[name].total_time, reverse=True):
+        funcs[key].print_region()
+        print()
+
 
 if __name__ == "__main__":
-  main(sys.argv[1])
\ No newline at end of file
+    main(sys.argv[1])

From 243dd3228309806a33063713f2f2c43203d89826 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 14:20:35 -0600
Subject: [PATCH 49/67] just use PARTHENON_INSTRUMENT where possible

---
 src/driver/driver.cpp          |  2 +-
 src/interface/update.hpp       |  6 +++---
 src/mesh/amr_loadbalance.cpp   |  8 ++++----
 src/outputs/parthenon_hdf5.cpp | 30 +++++++++++++++---------------
 src/utils/instrument.hpp       |  2 --
 5 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp
index 166586003bc4..b12a8dde3f9b 100644
--- a/src/driver/driver.cpp
+++ b/src/driver/driver.cpp
@@ -76,7 +76,7 @@ DriverStatus EvolutionDriver::Execute() {
   DumpInputParameters();
 
   { // Main t < tmax loop region
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     while (tm.KeepGoing()) {
       if (Globals::my_rank == 0) OutputCycleDiagnostics();
 
diff --git a/src/interface/update.hpp b/src/interface/update.hpp
index dd4b8158d0b8..21f035caefa3 100644
--- a/src/interface/update.hpp
+++ b/src/interface/update.hpp
@@ -292,19 +292,19 @@ TaskStatus FillDerived(T *rc) {
   PARTHENON_INSTRUMENT
   auto pm = rc->GetParentPointer();
   { // PreFillDerived region
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     for (const auto &pkg : pm->packages.AllPackages()) {
       pkg.second->PreFillDerived(rc);
     }
   } // PreFillDerived region
   { // FillDerived region
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     for (const auto &pkg : pm->packages.AllPackages()) {
       pkg.second->FillDerived(rc);
     }
   } // FillDerived region
   { // PostFillDerived region
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     for (const auto &pkg : pm->packages.AllPackages()) {
       pkg.second->PostFillDerived(rc);
     }
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 0814c5ccb487..1e76d17521b3 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -682,7 +682,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   int onbe = onbs + nblist[Globals::my_rank] - 1;
 
   { // Construct new list region
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal);
 
     // create a list mapping the previous gid to the current one
@@ -759,7 +759,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   // Send data from old to new blocks
   std::vector<MPI_Request> send_reqs;
   { // AMR Send region
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     for (int n = onbs; n <= onbe; n++) {
       int nn = oldtonew[n];
       LogicalLocation &oloc = loclist[n];
@@ -791,7 +791,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   // Construct a new MeshBlock list (moving the data within the MPI rank)
   BlockList_t new_block_list(nbe - nbs + 1);
   { // AMR Construct new MeshBlockList region
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     RegionSize block_size = GetBlockSize();
 
     for (int n = nbs; n <= nbe; n++) {
@@ -831,7 +831,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
 
   // Receive the data and load into MeshBlocks
   { // AMR Recv and unpack data
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     bool all_received;
     int niter = 0;
     if (block_list.size() > 0) {
diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index 3e9e5d55c2e7..e15a038fae86 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -113,9 +113,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   //   WRITING ATTRIBUTES                                                             //
   // -------------------------------------------------------------------------------- //
   H5G info_group;
-  {                                                     // write attributes
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) { // Input section
-      PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+  {                        // write attributes
+    PARTHENON_INSTRUMENT { // Input section
+      PARTHENON_INSTRUMENT
       // write input key-value pairs
       std::ostringstream oss;
       pin->ParameterDump(oss);
@@ -129,7 +129,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
     // we'll need this again at the end
     info_group = MakeGroup(file, "/Info");
     { // write info
-      PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+      PARTHENON_INSTRUMENT
       HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group);
 
       if (tm != nullptr) {
@@ -189,7 +189,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
     } // write info
 
     { // write params
-      PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+      PARTHENON_INSTRUMENT
       const H5G params_group = MakeGroup(file, "/Params");
 
       for (const auto &package : pm->packages.AllPackages()) {
@@ -234,7 +234,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 #endif
 
   { // write Blocks metadata
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     const H5G gBlocks = MakeGroup(file, "/Blocks");
 
     // write Xmin[ndim] for blocks
@@ -268,7 +268,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
   // Write mesh coordinates to file
   { // write mesh coords
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     for (const bool face : {true, false}) {
       const H5G gLocations = MakeGroup(file, face ? "/Locations" : "/VolumeLocations");
 
@@ -296,7 +296,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   // Write Levels and Logical Locations with the level for each Meshblock loclist contains
   // levels and logical locations for all meshblocks on all ranks
   { // write levels + log locs
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     const auto &loclist = pm->GetLocList();
 
     std::vector<std::int64_t> levels;
@@ -333,7 +333,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   hsize_t num_sparse;
   std::unique_ptr<hbool_t[]> sparse_allocated;
   { // write all variable data
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
 
     // All blocks have the same list of variable metadata that exist in the entire
     // simulation, but not all variables may be allocated on all blocks
@@ -408,7 +408,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
     // for each variable we write
     for (auto &vinfo : all_vars_info) {
-      PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+      PARTHENON_INSTRUMENT
       // not really necessary, but doesn't hurt
       memset(tmpData.data(), 0, tmpData.size() * sizeof(OutT));
 
@@ -478,7 +478,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
       hsize_t index = 0;
 
       { // fill host output buffer
-        PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+        PARTHENON_INSTRUMENT
         // for each local mesh block
         for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) {
           const auto &pmb = pm->block_list[b_idx];
@@ -551,7 +551,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
       } // fill host output buffer
 
       { // write variable data
-        PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+        PARTHENON_INSTRUMENT
         // write data to file
         HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt,
                     p_glob_cnt, pl_xfer, pl_dcreate);
@@ -590,7 +590,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   // write SparseInfo and SparseFields (we can't write a zero-size dataset, so only write
   // this if we have sparse fields)
   if (num_sparse > 0) {
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     local_count[1] = global_count[1] = num_sparse;
 
     HDF5Write2D(file, "SparseInfo", sparse_allocated.get(), p_loc_offset, p_loc_cnt,
@@ -611,7 +611,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
   AllSwarmInfo swarm_info(pm->block_list, output_params.swarms, restart_);
   { // write particle data
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     for (auto &[swname, swinfo] : swarm_info.all_info) {
       const H5G g_swm = MakeGroup(file, swname);
       // offsets/counts are NOT the same here vs the grid data
@@ -674,7 +674,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
   } // write particle data
 
   { // generate xdmf
-    PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL)
+    PARTHENON_INSTRUMENT
     // generate XDMF companion file
     XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info);
   } // generate xdmf
diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp
index df0a072e13d6..9f5e9aaecc77 100644
--- a/src/utils/instrument.hpp
+++ b/src/utils/instrument.hpp
@@ -24,8 +24,6 @@
 #define PARTHENON_INSTRUMENT_REGION(name)                                                \
   KokkosTimer __UNIQUE_INST_VAR(internal_inst_reg, __LINE__)(name);
 #define PARTHENON_AUTO_LABEL parthenon::build_auto_label(__FILE__, __LINE__, __func__)
-//#undef UNIQUE_VAR
-//#undef UNIQUE_VAR2
 
 namespace parthenon {
 namespace detail {

From 57ffd1db945c5d61c00af149c41f240c381d9a99 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 14:25:58 -0600
Subject: [PATCH 50/67] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 181a58b2698c..64e6fa9bf5c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Current develop
 
 ### Added (new features/APIs/variables/...)
+- [[PR 969]](https://github.com/parthenon-hpc-lab/parthenon/pull/969) New macro-based auto-naming of profiling regions and kernels
 - [[PR 907]](https://github.com/parthenon-hpc-lab/parthenon/pull/907) PEP1: Allow subclassing StateDescriptor
 - [[PR 932]](https://github.com/parthenon-hpc-lab/parthenon/pull/932) Add GetOrAddFlag to metadata
 - [[PR 931]](https://github.com/parthenon-hpc-lab/parthenon/pull/931) Allow SparsePacks with subsets of blocks

From e1c960bd2718b50dfd1f5e96fdc987fc54c97ad0 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 14:47:14 -0600
Subject: [PATCH 51/67] oops, tried to instrument a host-device function

---
 src/prolong_restrict/pr_loops.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/prolong_restrict/pr_loops.hpp b/src/prolong_restrict/pr_loops.hpp
index 2cb07ac4c547..288b18ed60a6 100644
--- a/src/prolong_restrict/pr_loops.hpp
+++ b/src/prolong_restrict/pr_loops.hpp
@@ -64,7 +64,6 @@ KOKKOS_INLINE_FUNCTION void InnerProlongationRestrictionLoop(
     team_mbr_t &team_member, std::size_t buf, const ProResInfoArr_t &info,
     const IndexRange &ckb, const IndexRange &cjb, const IndexRange &cib,
     const IndexRange &kb, const IndexRange &jb, const IndexRange &ib) {
-  PARTHENON_INSTRUMENT
   const auto &idxer = info(buf).idxer[static_cast<int>(CEL)];
   par_for_inner(
       inner_loop_pattern_tvr_tag, team_member, 0, idxer.size() - 1, [&](const int ii) {

From 0cc38e6439f171b34713fa81693dfe6da51396ea Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 16:37:44 -0600
Subject: [PATCH 52/67] fix broken merge

---
 src/mesh/amr_loadbalance.cpp   | 4 ----
 src/outputs/parthenon_hdf5.cpp | 1 +
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index a91e31bb7d67..665b8869428b 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -712,11 +712,7 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   std::vector<double> newcost(ntot);
   std::vector<int> newtoold(ntot);
   std::vector<int> oldtonew(nbtotal);
-  int nbtold = nbtotal;
   std::unordered_set<LogicalLocation> newly_refined;
-  // store old nbstart and nbend before load balancing.
-  int onbs = nslist[Globals::my_rank];
-  int onbe = onbs + nblist[Globals::my_rank] - 1;
 
   { // Construct new list region
     PARTHENON_INSTRUMENT
diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index d083636175ed..812945c90ab4 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -146,6 +146,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
                          info_group);
       // write number of ghost cells in simulation
       HDF5WriteAttribute("NGhost", Globals::nghost, info_group);
+      auto coords = Coordinates_t();
       HDF5WriteAttribute("Coordinates", std::string(coords.Name()).c_str(),
                          info_group);
 

From 192b1cb93c72b263fe2c30fda6ba68491066010c Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Wed, 1 Nov 2023 17:21:25 -0600
Subject: [PATCH 53/67] drop const

---
 src/utils/block_timer.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index d30cf945ae94..f298949b84c3 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -76,7 +76,7 @@ class BlockTimer {
   }
  private:
   const team_mbr_t *member_;
-  const T &pack_;
+  T &pack_;
   const int b_;
   const uint64_t start_;
 #else // no timers, so just stub this out

From c21900befbe966d9fb5548131e66f3ed22c05b28 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 09:11:53 -0600
Subject: [PATCH 54/67] refactor block timer

---
 src/utils/block_timer.hpp | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index f298949b84c3..84476544babf 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -26,27 +26,28 @@
 
 namespace parthenon {
 
-template <typename T = void>
 class BlockTimer {
 #ifdef ENABLE_LB_TIMERS
  public:
   BlockTimer() = delete;
   // constructor for team policies when there is no pack
   KOKKOS_INLINE_FUNCTION
-  BlockTimer(team_mbr_t &member, T cost)
-    : member_(&member), pack_(cost), b_(0), start_(Kokkos::Impl::clock_tic()) {}
+  BlockTimer(team_mbr_t &member, double *cost)
+    : member_(&member), cost_(cost), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for non-team policies when there is no pack
   KOKKOS_INLINE_FUNCTION
-  explicit BlockTimer(T cost)
-    : member_(nullptr), pack_(cost), b_(0), start_(Kokkos::Impl::clock_tic()) {}
+  explicit BlockTimer(double *cost)
+    : member_(nullptr), cost_(cost), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for team policies and packs
+  template <typename T>
   KOKKOS_INLINE_FUNCTION
   BlockTimer(team_mbr_t &member, const T &pack, const int b)
-    : member_(&member), pack_(pack), b_(b), start_(Kokkos::Impl::clock_tic()) {}
+    : member_(&member), cost_(&pack.GetCost(b)), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for non-team policies and packs
+  template <typename T>
   KOKKOS_INLINE_FUNCTION
   BlockTimer(const T &pack, const int b)
-    : member_(nullptr), pack_(pack), b_(b), start_(Kokkos::Impl::clock_tic()) {}
+    : member_(nullptr), cost_(&pack.GetCost(b)), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for team policies without a block index
   KOKKOS_INLINE_FUNCTION
   ~BlockTimer() {
@@ -57,27 +58,16 @@ class BlockTimer {
                     + static_cast<double>(stop) :
                   static_cast<double>(stop - start_));
     if (member_ == nullptr) {
-      if constexpr (std::is_same<T, double *>::value) {
-        Kokkos::atomic_add(pack_, diff);
-      } else {
-        Kokkos::atomic_add(&(pack_.GetCost(b_)), diff);
-      }
+      Kokkos::atomic_add(cost_, diff);
     } else {
-      if constexpr (std::is_same<T, double *>::value) {
-        Kokkos::single(Kokkos::PerTeam(*member_), [&] () {
-          Kokkos::atomic_add(pack_, diff);
-        });
-      } else {
-        Kokkos::single(Kokkos::PerTeam(*member_), [&] () {
-          Kokkos::atomic_add(&(pack_.GetCost(b_)), diff);
-        });
-      }
+      Kokkos::single(Kokkos::PerTeam(*member_), [&] () {
+        Kokkos::atomic_add(cost_, diff);
+      });
     }
   }
  private:
   const team_mbr_t *member_;
-  T &pack_;
-  const int b_;
+  double *cost_;
   const uint64_t start_;
 #else // no timers, so just stub this out
  public:

From ffd084bd738efe1a778256c6dbff0d376cc039a6 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 12:51:55 -0600
Subject: [PATCH 55/67] fix cellbounds bug

---
 src/mesh/mesh.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index c5ce1114b744..9904f5529911 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -224,9 +224,10 @@ class Mesh {
 
   std::pair<IndexShape, IndexShape> GetCellBounds() const {
     auto cb = [&](const int rfact) {
+      int include_ghosts = (rfact == 1 || multilevel);
       return IndexShape((ndim > 2) * block_size.nx(X3DIR)/rfact,
                         (ndim > 1) * block_size.nx(X2DIR)/rfact,
-                        block_size.nx(X1DIR)/rfact, multilevel*Globals::nghost);
+                        block_size.nx(X1DIR)/rfact, include_ghosts*Globals::nghost);
     };
     return std::make_pair(cb(1), cb(2));
   }

From 02dab9010e0c237d56604e32bdddf34557eccdcb Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 12:54:35 -0600
Subject: [PATCH 56/67] formatting

---
 src/interface/mesh_data.hpp        |  5 +--
 src/interface/sparse_pack.hpp      |  8 +---
 src/interface/sparse_pack_base.cpp |  4 +-
 src/interface/variable_state.cpp   |  1 -
 src/mesh/amr_loadbalance.cpp       | 63 ++++++++++++++++--------------
 src/mesh/mesh.hpp                  | 11 +++---
 src/outputs/parthenon_hdf5.cpp     |  8 ++--
 src/utils/block_timer.hpp          | 32 +++++++--------
 8 files changed, 61 insertions(+), 71 deletions(-)

diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp
index c824830bbd24..bdbf14a0e5c9 100644
--- a/src/interface/mesh_data.hpp
+++ b/src/interface/mesh_data.hpp
@@ -234,11 +234,8 @@ class MeshData {
     for (int i = 0; i < nblocks; i++) {
       block_data_[i] = blocks[i]->meshblock_data.Get(stage_name_);
     }
-
-  }
-  void Set(BlockList_t blocks) {
-    Set(blocks, blocks[0]->pmy_mesh);
   }
+  void Set(BlockList_t blocks) { Set(blocks, blocks[0]->pmy_mesh); }
 
   void Initialize(const MeshData<T> *src, const std::vector<std::string> &names,
                   const bool shallow);
diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp
index 76004520780f..0f525392c96e 100644
--- a/src/interface/sparse_pack.hpp
+++ b/src/interface/sparse_pack.hpp
@@ -160,14 +160,10 @@ class SparsePack : public SparsePackBase {
 #ifdef ENABLE_LB_TIMERS
   // Methods used for timing blocks
   KOKKOS_INLINE_FUNCTION
-  int GetLid(const int b) const {
-    return lid_(b);
-  }
+  int GetLid(const int b) const { return lid_(b); }
 
   KOKKOS_INLINE_FUNCTION
-  double &GetCost(const int b) const {
-    return cost_(GetLid(b));
-  }
+  double &GetCost(const int b) const { return cost_(GetLid(b)); }
 #endif
 
   // Bound overloads
diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index abbe3a854300..06a2a4690872 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -197,9 +197,9 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
       // packs.
       coords_h(b) = pmbd->GetBlockPointer()->coords_device;
     }
-  #ifdef ENABLE_LB_TIMERS
+#ifdef ENABLE_LB_TIMERS
     lid_h(blidx) = pmbd->GetBlockPointer()->lid;
-  #endif
+#endif
 
     for (int i = 0; i < nvar; ++i) {
       pack.bounds_h_(0, blidx, i) = idx;
diff --git a/src/interface/variable_state.cpp b/src/interface/variable_state.cpp
index ca77cf7bb4e8..b3d363f8613e 100644
--- a/src/interface/variable_state.cpp
+++ b/src/interface/variable_state.cpp
@@ -28,7 +28,6 @@ VariableState::VariableState(const Metadata &md, int sparse_id,
   tensor_shape[2] = dims[5];
   tensor_components = tensor_shape[0] * tensor_shape[1] * tensor_shape[2];
   topological_type = GetTopologicalType(md);
-
 }
 
 } // namespace parthenon
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 665b8869428b..7f147edb57b3 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -347,13 +347,13 @@ Real DistributeTrial(std::vector<Real> const &cost, std::vector<int> &start,
     start[rank] = last;
     Real rank_cost = cost[last];
     nleft--;
-    while (nleft > ranks_left && rank_cost + cost[last+1] < target_max) {
+    while (nleft > ranks_left && rank_cost + cost[last + 1] < target_max) {
       last += 1;
       rank_cost += cost[last];
       nleft--;
     }
     last++;
-    if (rank == nranks-1) {
+    if (rank == nranks - 1) {
       for (int n = last; n < nblocks; n++) {
         rank_cost += cost[n];
       }
@@ -365,8 +365,9 @@ Real DistributeTrial(std::vector<Real> const &cost, std::vector<int> &start,
   return trial_max;
 }
 
-double CalculateNewBalance(std::vector<double> const &cost, std::vector<int> &start, std::vector<int> &nb,
-                           const double avg_cost, const double max_block_cost) {
+double CalculateNewBalance(std::vector<double> const &cost, std::vector<int> &start,
+                           std::vector<int> &nb, const double avg_cost,
+                           const double max_block_cost) {
   const int nblocks = cost.size();
   const int max_rank = std::min(nblocks, Globals::nranks);
 
@@ -377,7 +378,7 @@ double CalculateNewBalance(std::vector<double> const &cost, std::vector<int> &st
 
   // set the bounds for the search
   double a = avg_cost;
-  double b = std::max(2.0*avg_cost, 1.01*max_block_cost);
+  double b = std::max(2.0 * avg_cost, 1.01 * max_block_cost);
   double h = b - a;
   constexpr double invphi = 0.618033988749894848204586834366;
   constexpr double invphi2 = 0.38196601125010515179541316563436188228;
@@ -407,7 +408,8 @@ double CalculateNewBalance(std::vector<double> const &cost, std::vector<int> &st
   return yc;
 }
 
-void AssignBlocks(const std::vector<int> &start, const std::vector<int> &nb, std::vector<int> &rank) {
+void AssignBlocks(const std::vector<int> &start, const std::vector<int> &nb,
+                  std::vector<int> &rank) {
   const int nblocks = rank.size();
   for (int i = 0; i < std::min(nblocks, Globals::nranks); i++) {
     for (int n = start[i]; n < start[i] + nb[i]; n++) {
@@ -439,15 +441,16 @@ std::tuple<double, double, double> BlockCostInfo(std::vector<double> const &cost
 
 } // namespace
 
-void Mesh::SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<int> &nb) {
+void Mesh::SetSimpleBalance(const int nblocks, std::vector<int> &start,
+                            std::vector<int> &nb) {
   const int max_rank = std::min(nblocks, Globals::nranks);
-  int nassign = nblocks/max_rank;
+  int nassign = nblocks / max_rank;
   start[0] = 0;
   nb[0] = nassign;
   int nassigned = nassign;
   for (int i = 1; i < max_rank; i++) {
-    nassign = (nblocks - nassigned)/(max_rank - i);
-    start[i] = start[i-1] + nb[i-1];
+    nassign = (nblocks - nassigned) / (max_rank - i);
+    start[i] = start[i - 1] + nb[i - 1];
     nb[i] = nassign;
     nassigned += nassign;
   }
@@ -457,9 +460,8 @@ void Mesh::SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vec
   }
 }
 
-void Mesh::CalculateLoadBalance(std::vector<double> const &cost,
-                                std::vector<int> &rank, std::vector<int> &start,
-                                std::vector<int> &nb) {
+void Mesh::CalculateLoadBalance(std::vector<double> const &cost, std::vector<int> &rank,
+                                std::vector<int> &start, std::vector<int> &nb) {
   PARTHENON_INSTRUMENT
   if ((lb_automatic_ || lb_manual_)) {
     SetSimpleBalance(cost.size(), start, nb);
@@ -480,10 +482,10 @@ void Mesh::CalculateLoadBalance(std::vector<double> const &cost,
 void Mesh::ResetLoadBalanceVariables() {
   if (lb_automatic_) {
 #ifdef ENABLE_LB_TIMERS
-    parthenon::par_for(loop_pattern_flatrange_tag, "reset cost_d", DevExecSpace(), 0, block_list.size()-1,
-        KOKKOS_LAMBDA(const int b) {
-          block_cost(b) = TINY_NUMBER;
-        });
+    parthenon::par_for(
+        loop_pattern_flatrange_tag, "reset cost_d", DevExecSpace(), 0,
+        block_list.size() - 1,
+        KOKKOS_LAMBDA(const int b) { block_cost(b) = TINY_NUMBER; });
 #endif
   } else if (lb_manual_) {
     for (int b = 0; b < block_list.size(); b++) {
@@ -639,24 +641,23 @@ void Mesh::GatherCostList() {
 #ifdef ENABLE_LB_TIMERS
     auto cost_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), block_cost);
 #ifdef MPI_PARALLEL
-    PARTHENON_MPI_CHECK(MPI_Allgatherv(cost_h.data(), nblist[Globals::my_rank], MPI_DOUBLE,
-                                       costlist.data(), nblist.data(), nslist.data(),
-                                       MPI_DOUBLE, MPI_COMM_WORLD));
+    PARTHENON_MPI_CHECK(MPI_Allgatherv(cost_h.data(), nblist[Globals::my_rank],
+                                       MPI_DOUBLE, costlist.data(), nblist.data(),
+                                       nslist.data(), MPI_DOUBLE, MPI_COMM_WORLD));
 #endif
 #endif
   }
   if (lb_manual_) {
 #ifdef MPI_PARALLEL
-    PARTHENON_MPI_CHECK(MPI_Allgatherv(block_cost.data(), nblist[Globals::my_rank], MPI_DOUBLE,
-                                       costlist.data(), nblist.data(), nslist.data(),
-                                       MPI_DOUBLE, MPI_COMM_WORLD));
+    PARTHENON_MPI_CHECK(MPI_Allgatherv(block_cost.data(), nblist[Globals::my_rank],
+                                       MPI_DOUBLE, costlist.data(), nblist.data(),
+                                       nslist.data(), MPI_DOUBLE, MPI_COMM_WORLD));
 #endif
   }
   Kokkos::Profiling::popRegion();
   return;
 }
 
-
 //----------------------------------------------------------------------------------------
 // \!fn void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot)
 // \brief redistribute MeshBlocks according to the new load balance
@@ -664,7 +665,7 @@ void Mesh::GatherCostList() {
 bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in,
                                            int ntot, bool modified) {
   PARTHENON_INSTRUMENT
-  
+
   GatherCostList();
   // store old things
   const int onbs = nslist[Globals::my_rank];
@@ -677,13 +678,15 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     // and only move things around if needed
     if (lb_automatic_ || lb_manual_) {
       Kokkos::Profiling::pushRegion("Reloadbalance");
-      auto [avg_cost, max_block_cost, max_rank_cost] = BlockCostInfo(costlist, nslist, nblist);
+      auto [avg_cost, max_block_cost, max_rank_cost] =
+          BlockCostInfo(costlist, nslist, nblist);
       std::vector<int> start_trial(Globals::nranks);
       std::vector<int> nb_trial(Globals::nranks);
-      double new_max = CalculateNewBalance(costlist, start_trial, nb_trial, avg_cost, max_block_cost);
+      double new_max =
+          CalculateNewBalance(costlist, start_trial, nb_trial, avg_cost, max_block_cost);
       Kokkos::Profiling::popRegion();
       // if the improvement isn't large enough, just return because we're done
-      if ((max_rank_cost - new_max)/max_rank_cost < lb_tolerance_) return false;
+      if ((max_rank_cost - new_max) / max_rank_cost < lb_tolerance_) return false;
       newrank.resize(ntot);
       AssignBlocks(start_trial, nb_trial, newrank);
       nslist = std::move(start_trial);
@@ -790,8 +793,8 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
   }
   restriction_cache.CopyToDevice();
   auto [cellbounds, c_cellbounds] = GetCellBounds();
-  refinement::Restrict(resolved_packages.get(), restriction_cache,
-                       cellbounds, c_cellbounds);
+  refinement::Restrict(resolved_packages.get(), restriction_cache, cellbounds,
+                       c_cellbounds);
 
   Kokkos::fence();
 
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 9904f5529911..dad41bf6715e 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -225,9 +225,9 @@ class Mesh {
   std::pair<IndexShape, IndexShape> GetCellBounds() const {
     auto cb = [&](const int rfact) {
       int include_ghosts = (rfact == 1 || multilevel);
-      return IndexShape((ndim > 2) * block_size.nx(X3DIR)/rfact,
-                        (ndim > 1) * block_size.nx(X2DIR)/rfact,
-                        block_size.nx(X1DIR)/rfact, include_ghosts*Globals::nghost);
+      return IndexShape((ndim > 2) * block_size.nx(X3DIR) / rfact,
+                        (ndim > 1) * block_size.nx(X2DIR) / rfact,
+                        block_size.nx(X1DIR) / rfact, include_ghosts * Globals::nghost);
     };
     return std::make_pair(cb(1), cb(2));
   }
@@ -282,9 +282,8 @@ class Mesh {
   void ResetLoadBalanceVariables();
 
   void SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<int> &nb);
-  void CalculateLoadBalance(std::vector<double> const &cost,
-                            std::vector<int> &rank, std::vector<int> &start,
-                            std::vector<int> &nb);
+  void CalculateLoadBalance(std::vector<double> const &cost, std::vector<int> &rank,
+                            std::vector<int> &start, std::vector<int> &nb);
   // Mesh::LoadBalancingAndAdaptiveMeshRefinement() helper functions:
   void UpdateMeshBlockTree(int &nnew, int &ndel);
   void GatherCostList();
diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp
index 812945c90ab4..3c02974db8c9 100644
--- a/src/outputs/parthenon_hdf5.cpp
+++ b/src/outputs/parthenon_hdf5.cpp
@@ -147,8 +147,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
       // write number of ghost cells in simulation
       HDF5WriteAttribute("NGhost", Globals::nghost, info_group);
       auto coords = Coordinates_t();
-      HDF5WriteAttribute("Coordinates", std::string(coords.Name()).c_str(),
-                         info_group);
+      HDF5WriteAttribute("Coordinates", std::string(coords.Name()).c_str(), info_group);
 
       // restart info, write always
       HDF5WriteAttribute("NBNew", pm->nbnew, info_group);
@@ -351,9 +350,8 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
 
     // get list of all vars, just use the first block since the list is the same for all
     // blocks
-    const auto vars = (pm->block_list.size() > 0) 
-                      ? get_vars(pm->block_list.front())
-                      : VariableVector<Real>();
+    const auto vars = (pm->block_list.size() > 0) ? get_vars(pm->block_list.front())
+                                                  : VariableVector<Real>();
     for (auto &v : vars) {
       all_vars_info.emplace_back(v);
     }
diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index 84476544babf..31bf3c967883 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -33,38 +33,37 @@ class BlockTimer {
   // constructor for team policies when there is no pack
   KOKKOS_INLINE_FUNCTION
   BlockTimer(team_mbr_t &member, double *cost)
-    : member_(&member), cost_(cost), start_(Kokkos::Impl::clock_tic()) {}
+      : member_(&member), cost_(cost), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for non-team policies when there is no pack
   KOKKOS_INLINE_FUNCTION
   explicit BlockTimer(double *cost)
-    : member_(nullptr), cost_(cost), start_(Kokkos::Impl::clock_tic()) {}
+      : member_(nullptr), cost_(cost), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for team policies and packs
   template <typename T>
-  KOKKOS_INLINE_FUNCTION
-  BlockTimer(team_mbr_t &member, const T &pack, const int b)
-    : member_(&member), cost_(&pack.GetCost(b)), start_(Kokkos::Impl::clock_tic()) {}
+  KOKKOS_INLINE_FUNCTION BlockTimer(team_mbr_t &member, const T &pack, const int b)
+      : member_(&member), cost_(&pack.GetCost(b)), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for non-team policies and packs
   template <typename T>
-  KOKKOS_INLINE_FUNCTION
-  BlockTimer(const T &pack, const int b)
-    : member_(nullptr), cost_(&pack.GetCost(b)), start_(Kokkos::Impl::clock_tic()) {}
+  KOKKOS_INLINE_FUNCTION BlockTimer(const T &pack, const int b)
+      : member_(nullptr), cost_(&pack.GetCost(b)), start_(Kokkos::Impl::clock_tic()) {}
   // constructor for team policies without a block index
   KOKKOS_INLINE_FUNCTION
   ~BlockTimer() {
     auto stop = Kokkos::Impl::clock_tic();
     // deal with overflow of clock
-    auto diff = (stop < start_ ?
-                  static_cast<double>(std::numeric_limits<uint64_t>::max() - start_)
-                    + static_cast<double>(stop) :
-                  static_cast<double>(stop - start_));
+    auto diff =
+        (stop < start_
+             ? static_cast<double>(std::numeric_limits<uint64_t>::max() - start_) +
+                   static_cast<double>(stop)
+             : static_cast<double>(stop - start_));
     if (member_ == nullptr) {
       Kokkos::atomic_add(cost_, diff);
     } else {
-      Kokkos::single(Kokkos::PerTeam(*member_), [&] () {
-        Kokkos::atomic_add(cost_, diff);
-      });
+      Kokkos::single(Kokkos::PerTeam(*member_),
+                     [&]() { Kokkos::atomic_add(cost_, diff); });
     }
   }
+
  private:
   const team_mbr_t *member_;
   double *cost_;
@@ -72,8 +71,7 @@ class BlockTimer {
 #else // no timers, so just stub this out
  public:
   template <typename... Args>
-  KOKKOS_INLINE_FUNCTION
-  BlockTimer(Args &&... args) {}
+  KOKKOS_INLINE_FUNCTION BlockTimer(Args &&...args) {}
 #endif
 };
 

From 8ed8c7c524f5b97ab19240bbb3ec73e06e831863 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 13:03:59 -0600
Subject: [PATCH 57/67] remove access of block 0

---
 src/mesh/amr_loadbalance.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 7f147edb57b3..191ee0b56519 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -963,8 +963,8 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     }
     prolongation_cache.CopyToDevice();
 
-    refinement::ProlongateShared(resolved_packages.get(), prolongation_cache,
-                                 block_list[0]->cellbounds, block_list[0]->c_cellbounds);
+    refinement::ProlongateShared(resolved_packages.get(), prolongation_cache, cellbounds,
+                                 c_cellbounds);
 
     // update the lists
     loclist = std::move(newloc);
@@ -996,8 +996,7 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     // Internal refinement relies on the fine shared values, which are only consistent
     // after being updated with any previously fine versions
     refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache,
-                                   block_list[0]->cellbounds,
-                                   block_list[0]->c_cellbounds);
+                                   cellbounds, c_cellbounds);
 
     // Rebuild just the ownership model, this time weighting the "new" fine blocks just
     // like any other blocks at their level.

From a66108a016500b62cb4633a7967e5f0e66f4255e Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 13:52:09 -0600
Subject: [PATCH 58/67] try to fix cuda build

---
 src/mesh/amr_loadbalance.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 191ee0b56519..f36197ed6c6d 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -482,10 +482,12 @@ void Mesh::CalculateLoadBalance(std::vector<double> const &cost, std::vector<int
 void Mesh::ResetLoadBalanceVariables() {
   if (lb_automatic_) {
 #ifdef ENABLE_LB_TIMERS
+    // make a local copy to make CUDA happy
+    auto bcost = block_cost;
     parthenon::par_for(
         loop_pattern_flatrange_tag, "reset cost_d", DevExecSpace(), 0,
         block_list.size() - 1,
-        KOKKOS_LAMBDA(const int b) { block_cost(b) = TINY_NUMBER; });
+        KOKKOS_LAMBDA(const int b) { bcost(b) = TINY_NUMBER; });
 #endif
   } else if (lb_manual_) {
     for (int b = 0; b < block_list.size(); b++) {

From 88cab107cead57debe33a441b5f11353bc1d1102 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 14:04:15 -0600
Subject: [PATCH 59/67] fix instrumentation

---
 src/mesh/amr_loadbalance.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index f36197ed6c6d..d2b601b11f5d 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -638,7 +638,7 @@ void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) {
 // \brief collect the cost from MeshBlocks and check the load balance
 
 void Mesh::GatherCostList() {
-  Kokkos::Profiling::pushRegion("GatherCostList");
+  PARTHENON_INSTRUMENT
   if (lb_automatic_) {
 #ifdef ENABLE_LB_TIMERS
     auto cost_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), block_cost);
@@ -656,7 +656,6 @@ void Mesh::GatherCostList() {
                                        nslist.data(), MPI_DOUBLE, MPI_COMM_WORLD));
 #endif
   }
-  Kokkos::Profiling::popRegion();
   return;
 }
 
@@ -679,7 +678,7 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
     // The mesh hasn't actually changed.  Let's just check the load balancing
     // and only move things around if needed
     if (lb_automatic_ || lb_manual_) {
-      Kokkos::Profiling::pushRegion("Reloadbalance");
+      PARTHENON_INSTRUMENT
       auto [avg_cost, max_block_cost, max_rank_cost] =
           BlockCostInfo(costlist, nslist, nblist);
       std::vector<int> start_trial(Globals::nranks);

From b0762209a821eb2b17c4d843486faec903b8c163 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 14:58:16 -0600
Subject: [PATCH 60/67] make ResetLoadBalanceVariables public

---
 src/mesh/mesh.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index dad41bf6715e..093625aea11b 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -232,6 +232,8 @@ class Mesh {
     return std::make_pair(cb(1), cb(2));
   }
 
+  void ResetLoadBalanceVariables();
+
  private:
   // data
   int root_level, max_level, current_level;
@@ -279,8 +281,6 @@ class Mesh {
   // functions
   MeshGenFunc MeshGenerator_[4];
 
-  void ResetLoadBalanceVariables();
-
   void SetSimpleBalance(const int nblocks, std::vector<int> &start, std::vector<int> &nb);
   void CalculateLoadBalance(std::vector<double> const &cost, std::vector<int> &rank,
                             std::vector<int> &start, std::vector<int> &nb);

From 908d6662d15cfd65c2a4ad4eac62d315f42b6387 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 16:40:36 -0600
Subject: [PATCH 61/67] add the ability to time things on host outside of
 kernels

---
 src/bvals/boundary_conditions_generic.hpp  |  8 +++++-
 src/bvals/comms/boundary_communication.cpp |  4 +--
 src/mesh/amr_loadbalance.cpp               | 14 +++++-----
 src/mesh/mesh.cpp                          |  7 +++--
 src/mesh/mesh.hpp                          |  4 ++-
 src/utils/block_timer.hpp                  | 30 ++++++++++++++++++++++
 6 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/src/bvals/boundary_conditions_generic.hpp b/src/bvals/boundary_conditions_generic.hpp
index 76a52b899c88..f228ea2d0073 100644
--- a/src/bvals/boundary_conditions_generic.hpp
+++ b/src/bvals/boundary_conditions_generic.hpp
@@ -27,6 +27,7 @@
 #include "mesh/domain.hpp"
 #include "mesh/mesh.hpp"
 #include "mesh/meshblock.hpp"
+#include "utils/block_timer.hpp"
 
 namespace parthenon {
 namespace BoundaryFunction {
@@ -40,6 +41,10 @@ void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
   // make sure DIR is X[123]DIR so we don't have to check again
   static_assert(DIR == X1DIR || DIR == X2DIR || DIR == X3DIR, "DIR must be X[123]DIR");
 
+  std::shared_ptr<MeshBlock> pmb = rc->GetBlockPointer();
+  auto &block_cost_host = pmb->pmy_mesh->block_cost_host;
+  BlockTimerHost host_timer(block_cost_host, pmb->lid, pmb->lid);
+
   // convenient shorthands
   constexpr bool X1 = (DIR == X1DIR);
   constexpr bool X2 = (DIR == X2DIR);
@@ -63,7 +68,6 @@ void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
   if (lend < lstart) return;
   auto nb = IndexRange{lstart, lend};
 
-  std::shared_ptr<MeshBlock> pmb = rc->GetBlockPointer();
   const auto &bounds = coarse ? pmb->c_cellbounds : pmb->cellbounds;
 
   const auto &range = X1 ? bounds.GetBoundsI(IndexDomain::interior, el)
@@ -87,6 +91,8 @@ void GenericBC(std::shared_ptr<MeshBlockData<Real>> &rc, bool coarse,
   // used for derivatives
   const int offsetin = INNER;
   const int offsetout = !INNER;
+  // stop timing on the host
+  host_timer.Stop();
   pmb->par_for_bndry(
       PARTHENON_AUTO_LABEL, nb, domain, el, coarse,
       KOKKOS_LAMBDA(const int &l, const int &k, const int &j, const int &i) {
diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp
index 54b483e754c5..fc9ab8dfdf6e 100644
--- a/src/bvals/comms/boundary_communication.cpp
+++ b/src/bvals/comms/boundary_communication.cpp
@@ -50,7 +50,7 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
-  auto &block_cost = pmesh->block_cost;
+  auto &block_cost = pmesh->GetBlockCost();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true);
 
   if (cache.buf_vec.size() == 0)
@@ -226,7 +226,7 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
   PARTHENON_INSTRUMENT
 
   Mesh *pmesh = md->GetMeshPointer();
-  auto &block_cost = pmesh->block_cost;
+  auto &block_cost = pmesh->GetBlockCost();
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
 
   auto [rebuild, nbound] = CheckReceiveBufferCacheForRebuild<bound_type, false>(md);
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index d2b601b11f5d..559801bcffb7 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -486,12 +486,13 @@ void Mesh::ResetLoadBalanceVariables() {
     auto bcost = block_cost;
     parthenon::par_for(
         loop_pattern_flatrange_tag, "reset cost_d", DevExecSpace(), 0,
-        block_list.size() - 1,
-        KOKKOS_LAMBDA(const int b) { bcost(b) = TINY_NUMBER; });
+        block_list.size() - 1, KOKKOS_LAMBDA(const int b) { bcost(b) = TINY_NUMBER; });
+    for (int b = 0; b < block_list.size(); b++)
+      block_cost_host[b] = TINY_NUMBER;
 #endif
   } else if (lb_manual_) {
     for (int b = 0; b < block_list.size(); b++) {
-      block_cost[b] = TINY_NUMBER;
+      block_cost_host[b] = TINY_NUMBER;
     }
   }
   step_since_lb = 0;
@@ -642,6 +643,8 @@ void Mesh::GatherCostList() {
   if (lb_automatic_) {
 #ifdef ENABLE_LB_TIMERS
     auto cost_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), block_cost);
+    for (int b = 0; b < block_cost_host.size(); b++)
+      cost_h(b) += block_cost_host[b];
 #ifdef MPI_PARALLEL
     PARTHENON_MPI_CHECK(MPI_Allgatherv(cost_h.data(), nblist[Globals::my_rank],
                                        MPI_DOUBLE, costlist.data(), nblist.data(),
@@ -651,7 +654,7 @@ void Mesh::GatherCostList() {
   }
   if (lb_manual_) {
 #ifdef MPI_PARALLEL
-    PARTHENON_MPI_CHECK(MPI_Allgatherv(block_cost.data(), nblist[Globals::my_rank],
+    PARTHENON_MPI_CHECK(MPI_Allgatherv(block_cost_host.data(), nblist[Globals::my_rank],
                                        MPI_DOUBLE, costlist.data(), nblist.data(),
                                        nslist.data(), MPI_DOUBLE, MPI_COMM_WORLD));
 #endif
@@ -767,9 +770,8 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
 
 #ifdef ENABLE_LB_TIMERS
   block_cost.Realloc(nbe - nbs + 1);
-#else
-  block_cost.resize(nbe - nbs + 1);
 #endif
+  block_cost_host.resize(nbe - nbs + 1);
 
   // Restrict fine to coarse buffers
   ProResCache_t restriction_cache;
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index ba141db27ab4..8eb073ea11f1 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -458,9 +458,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
   }
 #ifdef ENABLE_LB_TIMERS
   block_cost.Realloc(block_list.size());
-#else
-  block_cost.resize(block_list.size());
 #endif
+  block_cost_host.resize(block_list.size());
 
   ResetLoadBalanceVariables();
 }
@@ -721,9 +720,9 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,
 
 #ifdef ENABLE_LB_TIMERS
   block_cost.Realloc(block_list.size());
-#else
-  block_cost.resize(block_list.size());
 #endif
+  block_cost_host.resize(block_list.size());
+
   ResetLoadBalanceVariables();
 }
 
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 093625aea11b..4e092f4251e6 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -108,10 +108,12 @@ class Mesh {
   BlockList_t block_list;
   Packages_t packages;
   std::shared_ptr<StateDescriptor> resolved_packages;
+  std::vector<double> block_cost_host;
 #ifdef ENABLE_LB_TIMERS
   ParArray1D<double> block_cost;
+  auto &GetBlockCost() const { return block_cost; }
 #else
-  std::vector<double> block_cost;
+  auto &GetBlockCost() const { return block_cost_host; }
 #endif
 
   DataCollection<MeshData<Real>> mesh_data;
diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index 31bf3c967883..7d741edbc691 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -75,6 +75,36 @@ class BlockTimer {
 #endif
 };
 
+class BlockTimerHost {
+#ifdef ENABLE_LB_TIMERS
+ public:
+  BlockTimerHost(const std::vector<double> &cost, const int bs, const int be)
+      : cost_(cost), bs_(bs), be_(be), start_(Kokkos::Impl::clock_tic()) {}
+  void Stop() const {
+    auto stop = Kokkos::Impl::clock_tick();
+    // deal with overflow of clock
+    auto diff =
+        (stop < start_
+             ? static_cast<double>(std::numeric_limits<uint64_t>::max() - start_) +
+                   static_cast<double>(stop)
+             : static_cast<double>(stop - start_));
+    auto cost_per_block = diff / (be - bs + 1);
+    for (int b = bs; b <= be; b++)
+      cost_[b] += cost_per_block;
+  }
+
+ private:
+  const std::vector<double> &cost_;
+  const int bs_, be_;
+  const uint64_t start_;
+#else // stub out
+ public:
+  template <typename... Args>
+  BlockTimerHost(Args &&...args) {}
+  void Stop() const {}
+#endif
+};
+
 } // namespace parthenon
 
 #endif

From bbef7080680e958268f4b73c000d2bb7bc395f59 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 16:51:31 -0600
Subject: [PATCH 62/67] oops

---
 src/utils/block_timer.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index 7d741edbc691..f9a61f16177d 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -81,15 +81,15 @@ class BlockTimerHost {
   BlockTimerHost(const std::vector<double> &cost, const int bs, const int be)
       : cost_(cost), bs_(bs), be_(be), start_(Kokkos::Impl::clock_tic()) {}
   void Stop() const {
-    auto stop = Kokkos::Impl::clock_tick();
+    auto stop = Kokkos::Impl::clock_tic();
     // deal with overflow of clock
     auto diff =
         (stop < start_
              ? static_cast<double>(std::numeric_limits<uint64_t>::max() - start_) +
                    static_cast<double>(stop)
              : static_cast<double>(stop - start_));
-    auto cost_per_block = diff / (be - bs + 1);
-    for (int b = bs; b <= be; b++)
+    auto cost_per_block = diff / (be_ - bs_ + 1);
+    for (int b = bs_; b <= be_; b++)
       cost_[b] += cost_per_block;
   }
 

From 362ada90e7a92e5cca4b206a24ffcbb1f773d0f0 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 16:55:33 -0600
Subject: [PATCH 63/67] huh

---
 src/utils/block_timer.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils/block_timer.hpp b/src/utils/block_timer.hpp
index f9a61f16177d..ae7da11cf4c7 100644
--- a/src/utils/block_timer.hpp
+++ b/src/utils/block_timer.hpp
@@ -78,7 +78,7 @@ class BlockTimer {
 class BlockTimerHost {
 #ifdef ENABLE_LB_TIMERS
  public:
-  BlockTimerHost(const std::vector<double> &cost, const int bs, const int be)
+  BlockTimerHost(std::vector<double> &cost, const int bs, const int be)
       : cost_(cost), bs_(bs), be_(be), start_(Kokkos::Impl::clock_tic()) {}
   void Stop() const {
     auto stop = Kokkos::Impl::clock_tic();
@@ -94,7 +94,7 @@ class BlockTimerHost {
   }
 
  private:
-  const std::vector<double> &cost_;
+  std::vector<double> &cost_;
   const int bs_, be_;
   const uint64_t start_;
 #else // stub out

From b1c9faac4d3467ee6fbb205a13add1a4cfa53131 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 17:02:00 -0600
Subject: [PATCH 64/67] errant popRegion leftover from merge

---
 src/mesh/amr_loadbalance.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 559801bcffb7..1db3480e1624 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -688,7 +688,6 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
       std::vector<int> nb_trial(Globals::nranks);
       double new_max =
           CalculateNewBalance(costlist, start_trial, nb_trial, avg_cost, max_block_cost);
-      Kokkos::Profiling::popRegion();
       // if the improvement isn't large enough, just return because we're done
       if ((max_rank_cost - new_max) / max_rank_cost < lb_tolerance_) return false;
       newrank.resize(ntot);

From 64a023ec203925c626d844b20739d73b8c2d09da Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Thu, 2 Nov 2023 17:24:20 -0600
Subject: [PATCH 65/67] add more things

---
 src/bvals/comms/boundary_communication.cpp | 12 ++++++++++++
 src/mesh/mesh.cpp                          |  8 ++++++++
 2 files changed, 20 insertions(+)

diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp
index fc9ab8dfdf6e..15577b353055 100644
--- a/src/bvals/comms/boundary_communication.cpp
+++ b/src/bvals/comms/boundary_communication.cpp
@@ -51,6 +51,14 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
 
   Mesh *pmesh = md->GetMeshPointer();
   auto &block_cost = pmesh->GetBlockCost();
+  int min_lid = 9999999;
+  int max_lid = 0;
+  for (int b = 0; b < md->NumBlocks(); b++) {
+    int lid = md->GetBlockData(b)->GetBlockPointer()->lid;
+    min_lid = lid < min_lid ? lid : min_lid;
+    max_lid = lid > max_lid ? lid : max_lid;
+  }
+  BlockTimerHost host_timer(pmesh->block_cost_host, min_lid, max_lid);
   auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true);
 
   if (cache.buf_vec.size() == 0)
@@ -83,6 +91,8 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
   auto &sending_nonzero_flags = cache.sending_non_zero_flags;
   auto &sending_nonzero_flags_h = cache.sending_non_zero_flags_h;
 
+  host_timer.Stop();
+
   Kokkos::parallel_for(
       PARTHENON_AUTO_LABEL,
       Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
@@ -131,6 +141,7 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
         });
       });
 
+  BlockTimerHost host_timer2(pmesh->block_cost_host, min_lid, max_lid);
   // Send buffers
   if (Globals::sparse_config.enabled)
     Kokkos::deep_copy(sending_nonzero_flags_h, sending_nonzero_flags);
@@ -146,6 +157,7 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
     else
       buf.SendNull();
   }
+  host_timer2.Stop();
 
   return TaskStatus::complete;
 }
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 8eb073ea11f1..d759930b3865 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -1142,6 +1142,14 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
       init_done = false;
       // caching nbtotal the private variable my be updated in the following function
       const int nb_before_loadbalance = nbtotal;
+      // don't trust costs during problem initialization
+      for (int i = 0; i < nmb; ++i)
+        block_cost_host[i] = 1.0;
+#ifdef ENABLE_LB_TIMERS
+      parthenon::par_for(
+          DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nmb - 1,
+          KOKKOS_LAMBDA(const int b) { block_cost(b) = 1.0; });
+#endif
       LoadBalancingAndAdaptiveMeshRefinement(pin, app_in);
       if (nbtotal == nb_before_loadbalance) {
         init_done = true;

From 2c78030006989aebc22464a23fa79a1e1ad850ea Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 3 Nov 2023 08:13:36 -0600
Subject: [PATCH 66/67] fix loop pattern, reset lb vars

---
 src/mesh/amr_loadbalance.cpp | 6 +++++-
 src/mesh/mesh.cpp            | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 1db3480e1624..c02f95babb8d 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -368,6 +368,7 @@ Real DistributeTrial(std::vector<Real> const &cost, std::vector<int> &start,
 double CalculateNewBalance(std::vector<double> const &cost, std::vector<int> &start,
                            std::vector<int> &nb, const double avg_cost,
                            const double max_block_cost) {
+  PARTHENON_INSTRUMENT
   const int nblocks = cost.size();
   const int max_rank = std::min(nblocks, Globals::nranks);
 
@@ -689,7 +690,10 @@ bool Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
       double new_max =
           CalculateNewBalance(costlist, start_trial, nb_trial, avg_cost, max_block_cost);
       // if the improvement isn't large enough, just return because we're done
-      if ((max_rank_cost - new_max) / max_rank_cost < lb_tolerance_) return false;
+      if ((max_rank_cost - new_max) / max_rank_cost < lb_tolerance_) {
+        ResetLoadBalanceVariables();
+        return false;
+      }
       newrank.resize(ntot);
       AssignBlocks(start_trial, nb_trial, newrank);
       nslist = std::move(start_trial);
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index d759930b3865..9aa60df70f86 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -1147,7 +1147,7 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
         block_cost_host[i] = 1.0;
 #ifdef ENABLE_LB_TIMERS
       parthenon::par_for(
-          DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nmb - 1,
+          loop_pattern_flatrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nmb - 1,
           KOKKOS_LAMBDA(const int b) { block_cost(b) = 1.0; });
 #endif
       LoadBalancingAndAdaptiveMeshRefinement(pin, app_in);

From 0b13b20360f49ce41213a01a2afbb38403934bb1 Mon Sep 17 00:00:00 2001
From: jdolence <jcdolence@gmail.com>
Date: Fri, 3 Nov 2023 12:00:21 -0600
Subject: [PATCH 67/67] try to fix some things

---
 src/mesh/amr_loadbalance.cpp | 2 +-
 src/mesh/mesh.cpp            | 8 +-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index c02f95babb8d..49219de78bab 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -486,7 +486,7 @@ void Mesh::ResetLoadBalanceVariables() {
     // make a local copy to make CUDA happy
     auto bcost = block_cost;
     parthenon::par_for(
-        loop_pattern_flatrange_tag, "reset cost_d", DevExecSpace(), 0,
+        loop_pattern_flatrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0,
         block_list.size() - 1, KOKKOS_LAMBDA(const int b) { bcost(b) = TINY_NUMBER; });
     for (int b = 0; b < block_list.size(); b++)
       block_cost_host[b] = TINY_NUMBER;
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 9aa60df70f86..821c972b19de 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -1143,13 +1143,7 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *
       // caching nbtotal the private variable my be updated in the following function
       const int nb_before_loadbalance = nbtotal;
       // don't trust costs during problem initialization
-      for (int i = 0; i < nmb; ++i)
-        block_cost_host[i] = 1.0;
-#ifdef ENABLE_LB_TIMERS
-      parthenon::par_for(
-          loop_pattern_flatrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nmb - 1,
-          KOKKOS_LAMBDA(const int b) { block_cost(b) = 1.0; });
-#endif
+      ResetLoadBalanceVariables();
       LoadBalancingAndAdaptiveMeshRefinement(pin, app_in);
       if (nbtotal == nb_before_loadbalance) {
         init_done = true;