From 59d00674e7f133968b80402719b193234b35833a Mon Sep 17 00:00:00 2001
From: Philipp Grete
Date: Thu, 14 Nov 2024 13:55:47 +0100
Subject: [PATCH 1/4] Only update energy in RKL2 int

---
 src/hydro/hydro_driver.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/hydro/hydro_driver.cpp b/src/hydro/hydro_driver.cpp
index eeb29d3d..b0619a51 100644
--- a/src/hydro/hydro_driver.cpp
+++ b/src/hydro/hydro_driver.cpp
@@ -185,19 +185,19 @@ TaskStatus RKL2StepOther(MeshData<Real> *md_Y0, MeshData<Real> *md_Yjm1,
   // by enough work over the entire pack and it allows to not use any conditionals.
   parthenon::par_for(
       DEFAULT_LOOP_PATTERN, "RKL other step", parthenon::DevExecSpace(), 0,
-      Y0.GetDim(5) - 1, 0, Y0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
-      KOKKOS_LAMBDA(const int b, const int v, const int k, const int j, const int i) {
+      Y0.GetDim(5) - 1, 0, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
         // First calc this step
         const auto &coords = Yjm1.GetCoords(b);
         const Real MYjm1 =
-            parthenon::Update::FluxDivHelper(v, k, j, i, ndim, coords, Yjm1(b));
-        const Real Yj = mu_j * Yjm1(b, v, k, j, i) + nu_j * Yjm2(b, v, k, j, i) +
-                        (1.0 - mu_j - nu_j) * Y0(b, v, k, j, i) +
+            parthenon::Update::FluxDivHelper(IEN, k, j, i, ndim, coords, Yjm1(b));
+        const Real Yj = mu_j * Yjm1(b, IEN, k, j, i) + nu_j * Yjm2(b, IEN, k, j, i) +
+                        (1.0 - mu_j - nu_j) * Y0(b, IEN, k, j, i) +
                         mu_tilde_j * tau * MYjm1 +
-                        gamma_tilde_j * tau * MY0(b, v, k, j, i);
+                        gamma_tilde_j * tau * MY0(b, IEN, k, j, i);
         // Then shuffle vars for next step
-        Yjm2(b, v, k, j, i) = Yjm1(b, v, k, j, i);
-        Yjm1(b, v, k, j, i) = Yj;
+        Yjm2(b, IEN, k, j, i) = Yjm1(b, IEN, k, j, i);
+        Yjm1(b, IEN, k, j, i) = Yj;
       });
 
   return TaskStatus::complete;
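
Reviewer note (not part of the patches): written out, the kernel above evaluates the
RKL2 inner-stage recurrence of super-time-stepping (cf. Meyer, Balsara & Aslam 2014),
which this patch now applies to the energy component IEN only:

    Y_j = mu_j * Y_{j-1} + nu_j * Y_{j-2} + (1 - mu_j - nu_j) * Y_0
          + mu_tilde_j * tau * M(Y_{j-1}) + gamma_tilde_j * tau * M(Y_0)

where M(.) is the flux divergence computed by FluxDivHelper and tau is the diffusive
super-timestep.
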
From 1dbfccd3b05333afc4782bb44c2a0a794469d736 Mon Sep 17 00:00:00 2001
From: Philipp Grete
Date: Thu, 14 Nov 2024 15:06:48 +0100
Subject: [PATCH 2/4] Fix leftover index

---
 src/hydro/hydro_driver.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hydro/hydro_driver.cpp b/src/hydro/hydro_driver.cpp
index b0619a51..f01e4c2d 100644
--- a/src/hydro/hydro_driver.cpp
+++ b/src/hydro/hydro_driver.cpp
@@ -185,7 +185,7 @@ TaskStatus RKL2StepOther(MeshData<Real> *md_Y0, MeshData<Real> *md_Yjm1,
   // by enough work over the entire pack and it allows to not use any conditionals.
   parthenon::par_for(
       DEFAULT_LOOP_PATTERN, "RKL other step", parthenon::DevExecSpace(), 0,
-      Y0.GetDim(5) - 1, 0, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      Y0.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
         // First calc this step
         const auto &coords = Yjm1.GetCoords(b);

From a4155c927af6f56e63999fd346a3a61a08e6881c Mon Sep 17 00:00:00 2001
From: Philipp Grete
Date: Thu, 14 Nov 2024 15:23:13 +0100
Subject: [PATCH 3/4] Fix reset of dt_diff

---
 src/hydro/hydro_driver.cpp | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/hydro/hydro_driver.cpp b/src/hydro/hydro_driver.cpp
index f01e4c2d..afcba1d8 100644
--- a/src/hydro/hydro_driver.cpp
+++ b/src/hydro/hydro_driver.cpp
@@ -660,8 +660,24 @@ TaskCollection HydroDriver::MakeTaskCollection(BlockList_t &blocks, int stage) {
                                  pmesh->multilevel);
   }
 
+  TaskRegion &single_tasklist_per_pack_region_3 = tc.AddRegion(num_partitions);
+  for (int i = 0; i < num_partitions; i++) {
+    auto &tl = single_tasklist_per_pack_region_3[i];
+    auto &mu0 = pmesh->mesh_data.GetOrAdd("base", i);
+    auto fill_derived =
+        tl.AddTask(none, parthenon::Update::FillDerived<MeshData<Real>>, mu0.get());
+  }
+  const auto &diffint = hydro_pkg->Param<DiffInt>("diffint");
+  // If any tasks modify the conserved variables before this place and after FillDerived,
+  // then the STS tasks should be updated to not assume prim and cons are in sync.
+  if (diffint == DiffInt::rkl2 && stage == integrator->nstages) {
+    AddSTSTasks(&tc, pmesh, blocks, 0.5 * tm.dt);
+  }
+
   // Single task in single (serial) region to reset global vars used in reductions in the
   // first stage.
+  // TODO(pgrete) check if we logically need this reset or if we can reset within the
+  // timestep task
   if (stage == integrator->nstages &&
       (hydro_pkg->Param<bool>("calc_c_h") ||
        hydro_pkg->Param<DiffInt>("diffint") != DiffInt::none)) {
@@ -678,20 +694,6 @@ TaskCollection HydroDriver::MakeTaskCollection(BlockList_t &blocks, int stage) {
         hydro_pkg.get());
   }
 
-  TaskRegion &single_tasklist_per_pack_region_3 = tc.AddRegion(num_partitions);
-  for (int i = 0; i < num_partitions; i++) {
-    auto &tl = single_tasklist_per_pack_region_3[i];
-    auto &mu0 = pmesh->mesh_data.GetOrAdd("base", i);
-    auto fill_derived =
-        tl.AddTask(none, parthenon::Update::FillDerived<MeshData<Real>>, mu0.get());
-  }
-  const auto &diffint = hydro_pkg->Param<DiffInt>("diffint");
-  // If any tasks modify the conserved variables before this place and after FillDerived,
-  // then the STS tasks should be updated to not assume prim and cons are in sync.
-  if (diffint == DiffInt::rkl2 && stage == integrator->nstages) {
-    AddSTSTasks(&tc, pmesh, blocks, 0.5 * tm.dt);
-  }
-
   if (stage == integrator->nstages) {
     TaskRegion &tr = tc.AddRegion(num_partitions);
     for (int i = 0; i < num_partitions; i++) {
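
Reviewer note (not part of the patches): the reordering above matters because the STS
task list is *built* from dt_diff, so resetting the global reduction variables before
AddSTSTasks changes which tasks get added. As a minimal standalone sketch of why, this
is how an RKL2 driver typically derives its stage count from the timestep ratio,
following the stability bound of Meyer, Balsara & Aslam (2014); NumRKL2Stages is a
hypothetical helper for illustration, not AthenaPK API, and assumes dt_diff > 0:

    #include <cmath>

    // An RKL2 super-step of length tau is stable for s stages if
    //   tau <= dt_diff * (s * s + s - 2) / 4,
    // so the smallest admissible s is the positive root of
    //   s^2 + s - (2 + 4 * tau / dt_diff) = 0, rounded up.
    int NumRKL2Stages(const double tau, const double dt_diff) {
      int s = static_cast<int>(
          std::ceil(0.5 * (std::sqrt(9.0 + 16.0 * tau / dt_diff) - 1.0)));
      // Stage counts are commonly rounded up to the next odd integer in practice.
      if (s % 2 == 0) {
        s += 1;
      }
      return s;
    }

A stale (unreset or unreduced) dt_diff thus directly changes the constructed task
collection, which is what PATCH 3 and PATCH 4 guard against.
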
From 487c6b82fc882e6f4973d93a31a0762b78d0d9ad Mon Sep 17 00:00:00 2001
From: Philipp Grete
Date: Tue, 19 Nov 2024 10:12:30 +0100
Subject: [PATCH 4/4] Ensure dt_diff is set when STS tasks are added

---
 src/hydro/hydro.cpp        | 102 ++++++++++++++++++++++++++++++++-----
 src/hydro/hydro_driver.cpp |  94 ----------------------------------
 2 files changed, 89 insertions(+), 107 deletions(-)

diff --git a/src/hydro/hydro.cpp b/src/hydro/hydro.cpp
index 5720346c..1f6e8e36 100644
--- a/src/hydro/hydro.cpp
+++ b/src/hydro/hydro.cpp
@@ -57,12 +57,87 @@ parthenon::Packages_t ProcessPackages(std::unique_ptr<ParameterInput> &pin) {
   return packages;
 }
 
+// Calculate minimum dx, which is used in calculating the divergence cleaning speed c_h
+// TODO(PG) eventually move to calculating the timestep once the timestep calc
+// has been moved to be done before Step()
+Real CalculateGlobalMinDx(MeshData<Real> *md) {
+  auto *pmb = md->GetBlockData(0)->GetBlockPointer();
+  auto hydro_pkg = pmb->packages.Get("Hydro");
+
+  const auto &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
+
+  IndexRange ib = md->GetBlockData(0)->GetBoundsI(IndexDomain::interior);
+  IndexRange jb = md->GetBlockData(0)->GetBoundsJ(IndexDomain::interior);
+  IndexRange kb = md->GetBlockData(0)->GetBoundsK(IndexDomain::interior);
+
+  Real mindx = std::numeric_limits<Real>::max();
+
+  bool nx2 = prim_pack.GetDim(2) > 1;
+  bool nx3 = prim_pack.GetDim(3) > 1;
+  pmb->par_reduce(
+      "CalculateGlobalMinDx", 0, prim_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
+      ib.e,
+      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lmindx) {
+        const auto &coords = prim_pack.GetCoords(b);
+        lmindx = fmin(lmindx, coords.Dxc<1>(k, j, i));
+        if (nx2) {
+          lmindx = fmin(lmindx, coords.Dxc<2>(k, j, i));
+        }
+        if (nx3) {
+          lmindx = fmin(lmindx, coords.Dxc<3>(k, j, i));
+        }
+      },
+      Kokkos::Min<Real>(mindx));
+
+  return mindx;
+}
+
 // Using this per cycle function to populate various variables in
 // Params that require global reduction *and* need to be set/known when
 // the task list is constructed (versus when the task list is being executed).
 // TODO(next person touching this function): If more/separate feature are required
 // please separate concerns.
-void PreStepMeshUserWorkInLoop(Mesh *pmesh, ParameterInput *pin, SimTime &tm) {}
+void PreStepMeshUserWorkInLoop(Mesh *pmesh, ParameterInput *pin, SimTime &tm) {
+  auto hydro_pkg = pmesh->packages.Get("Hydro");
+
+  // Calculate hyperbolic divergence cleaning speed
+  // TODO(pgrete) Calculating mindx is only required after remeshing. Need to
+  // find a clean solution for this one-off global reduction.
+  if (hydro_pkg->Param<bool>("calc_c_h") ||
+      hydro_pkg->Param<DiffInt>("diffint") != DiffInt::none) {
+
+    Real mindx = std::numeric_limits<Real>::max();
+    // Going over default partitions. Not using a (new) single partition containing
+    // all blocks here as this (default) split is also used in the main Step() function
+    // and thus does not create an overhead (such as creating a new MeshBlockPack that
+    // is just used here). All partitions are executed sequentially. Given that a
+    // par_reduce to a host var is blocking, it's safe to directly use the return value.
+    const int num_partitions = pmesh->DefaultNumPartitions();
+    for (int i = 0; i < num_partitions; i++) {
+      auto &mu0 = pmesh->mesh_data.GetOrAdd("base", i);
+      mindx = std::min(mindx, CalculateGlobalMinDx(mu0.get()));
+    }
+#ifdef MPI_PARALLEL
+    Real mins[3];
+    mins[0] = mindx;
+    mins[1] = hydro_pkg->Param<Real>("dt_hyp");
+    mins[2] = hydro_pkg->Param<Real>("dt_diff");
+    PARTHENON_MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_PARTHENON_REAL, MPI_MIN,
+                                      MPI_COMM_WORLD));
+
+    hydro_pkg->UpdateParam("mindx", mins[0]);
+    hydro_pkg->UpdateParam("dt_hyp", mins[1]);
+    hydro_pkg->UpdateParam("dt_diff", mins[2]);
+#else
+    hydro_pkg->UpdateParam("mindx", mindx);
+    // dt_hyp and dt_diff are already set directly in Params when they're calculated
+#endif
+    // Finally update c_h (using the globally reduced minima from Params)
+    const auto &cfl_hyp = hydro_pkg->Param<Real>("cfl");
+    const auto &dt_hyp = hydro_pkg->Param<Real>("dt_hyp");
+    hydro_pkg->UpdateParam("c_h", cfl_hyp * hydro_pkg->Param<Real>("mindx") / dt_hyp);
+  }
+}
 
 template <Hst hst, int idx = -1>
 Real HydroHst(MeshData<Real> *md) {
@@ -1189,10 +1264,10 @@ TaskStatus FirstOrderFluxCorrect(MeshData<Real> *u0_data, MeshData<Real> *u1_dat
         const auto &u0_prim = u0_prim_pack(b);
         auto &u0_cons = u0_cons_pack(b);
 
-        // In principle, the u_cons.fluxes could be updated in parallel by a different
-        // thread resulting in a race conditon here.
-        // However, if the fluxes of a cell have been updated (anywhere) then the entire
-        // kernel will be called again anyway, and, at that point the already fixed
+        // In principle, the u_cons.fluxes could be updated in parallel by a
+        // different thread resulting in a race condition here. However, if the
+        // fluxes of a cell have been updated (anywhere) then the entire kernel will
+        // be called again anyway, and, at that point the already fixed
         // u0_cons.fluxes will automaticlly be used here.
         Real new_cons[NVAR];
         for (auto v = 0; v < NVAR; v++) {
@@ -1220,13 +1295,13 @@ TaskStatus FirstOrderFluxCorrect(MeshData<Real> *u0_data, MeshData<Real> *u1_dat
             lnum_need_floor += 1;
             return;
           }
-          // In principle, there could be a racecondion as this loop goes over all k,j,i
-          // and we updating the i+1 flux here.
-          // However, the results are idential because u0_prim is never updated in this
-          // kernel so we don't worry about it.
-          // TODO(pgrete) as we need to keep the function signature idential for now (due
-          // to Cuda compiler bug) we could potentially template these function and get
-          // rid of the `if constexpr`
+          // In principle, there could be a race condition as this loop goes over all
+          // k,j,i and we're updating the i+1 flux here. However, the results are
+          // identical because u0_prim is never updated in this kernel so we don't
+          // worry about it.
+          // TODO(pgrete) as we need to keep the function signature identical for now
+          // (due to Cuda compiler bug) we could potentially template these functions
+          // and get rid of the `if constexpr`
           riemann.Solve(eos, k, j, i, IV1, u0_prim, u0_cons, c_h);
           riemann.Solve(eos, k, j, i + 1, IV1, u0_prim, u0_cons, c_h);
 
@@ -1243,7 +1318,8 @@ TaskStatus FirstOrderFluxCorrect(MeshData<Real> *u0_data, MeshData<Real> *u1_dat
       Kokkos::Sum(num_corrected), Kokkos::Sum(num_need_floor));
 
   // TODO(pgrete) make this optional and global (potentially store values in Params)
-  // std::cout << "[" << parthenon::Globals::my_rank << "] Attempt: " << num_attempts
+  // std::cout << "[" << parthenon::Globals::my_rank << "] Attempt: " <<
+  // num_attempts
   //           << " Corrected (center): " << num_corrected
   //           << " Failed (will rely on floor): " << num_need_floor << std::endl;
   num_attempts += 1;
diff --git a/src/hydro/hydro_driver.cpp b/src/hydro/hydro_driver.cpp
index afcba1d8..dd9d4540 100644
--- a/src/hydro/hydro_driver.cpp
+++ b/src/hydro/hydro_driver.cpp
@@ -37,46 +37,6 @@ HydroDriver::HydroDriver(ParameterInput *pin, ApplicationInput *app_in, Mesh *pm
   pin->CheckDesired("parthenon/time", "cfl");
 }
 
-// Calculate mininum dx, which is used in calculating the divergence cleaning speed c_h
-TaskStatus CalculateGlobalMinDx(MeshData<Real> *md) {
-  auto pmb = md->GetBlockData(0)->GetBlockPointer();
-  auto hydro_pkg = pmb->packages.Get("Hydro");
-
-  const auto &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
-
-  IndexRange ib = md->GetBlockData(0)->GetBoundsI(IndexDomain::interior);
-  IndexRange jb = md->GetBlockData(0)->GetBoundsJ(IndexDomain::interior);
-  IndexRange kb = md->GetBlockData(0)->GetBoundsK(IndexDomain::interior);
-
-  Real mindx = std::numeric_limits<Real>::max();
-
-  bool nx2 = prim_pack.GetDim(2) > 1;
-  bool nx3 = prim_pack.GetDim(3) > 1;
-  pmb->par_reduce(
-      "CalculateGlobalMinDx", 0, prim_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-      ib.e,
-      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lmindx) {
-        const auto &coords = prim_pack.GetCoords(b);
-        lmindx = fmin(lmindx, coords.Dxc<1>(k, j, i));
-        if (nx2) {
-          lmindx = fmin(lmindx, coords.Dxc<2>(k, j, i));
-        }
-        if (nx3) {
-          lmindx = fmin(lmindx, coords.Dxc<3>(k, j, i));
-        }
-      },
-      Kokkos::Min<Real>(mindx));
-
-  // Reduction to host var is blocking and only have one of this tasks run at the same
-  // time so modifying the package should be safe.
-  auto mindx_pkg = hydro_pkg->Param<Real>("mindx");
-  if (mindx < mindx_pkg) {
-    hydro_pkg->UpdateParam("mindx", mindx);
-  }
-
-  return TaskStatus::complete;
-}
-
 // Sets all fluxes to 0
 TaskStatus ResetFluxes(MeshData<Real> *md) {
   auto pmb = md->GetBlockData(0)->GetBlockPointer();
@@ -444,60 +404,6 @@ TaskCollection HydroDriver::MakeTaskCollection(BlockList_t &blocks, int stage) {
     }
   }
 
-  // Calculate hyperbolic divergence cleaning speed
-  // TODO(pgrete) Calculating mindx is only required after remeshing. Need to find a clean
-  // solution for this one-off global reduction.
-  // TODO(PG) move this to PreStepMeshUserWorkInLoop
-  if ((hydro_pkg->Param<bool>("calc_c_h") ||
-       hydro_pkg->Param<DiffInt>("diffint") != DiffInt::none) &&
-      (stage == 1)) {
-    // need to make sure that there's only one region in order to MPI_reduce to work
-    TaskRegion &single_task_region = tc.AddRegion(1);
-    auto &tl = single_task_region[0];
-    // Adding one task for each partition. Not using a (new) single partition containing
-    // all blocks here as this (default) split is also used for the following tasks and
-    // thus does not create an overhead (such as creating a new MeshBlockPack that is just
-    // used here). Given that all partitions are in one task list they'll be executed
-    // sequentially. Given that a par_reduce to a host var is blocking it's also save to
-    // store the variable in the Params for now.
-    auto prev_task = none;
-    for (int i = 0; i < num_partitions; i++) {
-      auto &mu0 = pmesh->mesh_data.GetOrAdd("base", i);
-      auto new_mindx = tl.AddTask(prev_task, CalculateGlobalMinDx, mu0.get());
-      prev_task = new_mindx;
-    }
-    auto reduce_c_h = prev_task;
-#ifdef MPI_PARALLEL
-    reduce_c_h = tl.AddTask(
-        prev_task,
-        [](StateDescriptor *hydro_pkg) {
-          Real mins[3];
-          mins[0] = hydro_pkg->Param<Real>("mindx");
-          mins[1] = hydro_pkg->Param<Real>("dt_hyp");
-          mins[2] = hydro_pkg->Param<Real>("dt_diff");
-          PARTHENON_MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_PARTHENON_REAL,
-                                            MPI_MIN, MPI_COMM_WORLD));
-
-          hydro_pkg->UpdateParam("mindx", mins[0]);
-          hydro_pkg->UpdateParam("dt_hyp", mins[1]);
-          hydro_pkg->UpdateParam("dt_diff", mins[2]);
-          return TaskStatus::complete;
-        },
-        hydro_pkg.get());
-#endif
-    // Finally update c_h
-    auto update_c_h = tl.AddTask(
-        reduce_c_h,
-        [](StateDescriptor *hydro_pkg) {
-          const auto &mindx = hydro_pkg->Param<Real>("mindx");
-          const auto &cfl_hyp = hydro_pkg->Param<Real>("cfl");
-          const auto &dt_hyp = hydro_pkg->Param<Real>("dt_hyp");
-          hydro_pkg->UpdateParam("c_h", cfl_hyp * mindx / dt_hyp);
-          return TaskStatus::complete;
-        },
-        hydro_pkg.get());
-  }
-
   // calculate magnetic tower scaling
   if ((stage == 1) && hydro_pkg->AllParams().hasKey("magnetic_tower_power_scaling") &&
       hydro_pkg->Param<bool>("magnetic_tower_power_scaling")) {
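
Reviewer note (not part of the patches): PATCH 4 replaces the per-partition reduction
tasks removed above with blocking per-partition calls in PreStepMeshUserWorkInLoop,
followed by one packed in-place MPI reduction. A minimal standalone sketch of that
reduction pattern; ReduceGlobalMins is a hypothetical helper, and MPI_PARTHENON_REAL
in the patch corresponds to MPI_DOUBLE when Real is double:

    #include <mpi.h>

    // One MPI_Allreduce computes the global minimum of several independent
    // scalars by packing them into a single buffer.
    void ReduceGlobalMins(double &mindx, double &dt_hyp, double &dt_diff) {
      double mins[3] = {mindx, dt_hyp, dt_diff};
      // MPI_IN_PLACE: every rank's send buffer doubles as its receive buffer.
      MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
      mindx = mins[0];
      dt_hyp = mins[1];
      dt_diff = mins[2];
    }

Because the reduction now completes before MakeTaskCollection runs, dt_diff is
guaranteed to be globally consistent at the moment AddSTSTasks sizes the STS task list.
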