WIP: Tiling z flux for performance

parthenon-hpc-lab · Jan 30, 2024 · 726f82b · 726f82b
1 parent f334d20
commit 726f82b
Show file tree

Hide file tree

Showing 4 changed files with 351 additions and 3 deletions.
diff --git a/external/parthenon b/external/parthenon
diff --git a/src/hydro/hydro.cpp b/src/hydro/hydro.cpp
@@ -848,7 +848,7 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshData<Real>> &md) {
         parthenon::ScratchPad2D<Real>::shmem_size(num_scratch_vars, nx1) * 3;
     // set the loop limits
     il = ib.s - 1, iu = ib.e + 1, kl = kb.s, ku = kb.e;
-    if (pmb->block_size.nx3 == 1) // 2D
+    if (pmb->block_size.nx3 == 1) // 2D                                                                        
       kl = kb.s, ku = kb.e;
     else // 3D
       kl = kb.s - 1, ku = kb.e + 1;
@@ -951,6 +951,236 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshData<Real>> &md) {
   return TaskStatus::complete;
 }
 
+// Calculate fluxes using scratch pad memory, i.e., over cached pencils in i-dir.
+template<>  //forrestglines:: Specialize for testing
+TaskStatus CalculateFluxes<Fluid::euler, Reconstruction::plm, RiemannSolver::hlle>
+    (std::shared_ptr<MeshData<Real>> &md) {
+  auto const fluid = Fluid::euler;
+  auto const recon = Reconstruction::plm;
+  auto const rsolver = RiemannSolver::hlle;
+  auto pmb = md->GetBlockData(0)->GetBlockPointer();
+  IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
+  IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
+  IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);
+  int il, iu, jl, ju, kl, ku;
+  jl = jb.s, ju = jb.e, kl = kb.s, ku = kb.e;
+  // TODO(pgrete): are these looop limits are likely too large for 2nd order
+  if (pmb->block_size.nx2 > 1) {
+    if (pmb->block_size.nx3 == 1) // 2D
+      jl = jb.s - 1, ju = jb.e + 1, kl = kb.s, ku = kb.e;
+    else // 3D
+      jl = jb.s - 1, ju = jb.e + 1, kl = kb.s - 1, ku = kb.e + 1;
+  }
+
+  std::vector<parthenon::MetadataFlag> flags_ind({Metadata::Independent});
+  auto cons_in = md->PackVariablesAndFluxes(flags_ind);
+  auto pkg = pmb->packages.Get("Hydro");
+  const auto nhydro = pkg->Param<int>("nhydro");
+  const auto nscalars = pkg->Param<int>("nscalars");
+
+  const auto &eos =
+      pkg->Param<typename std::conditional<fluid == Fluid::euler, AdiabaticHydroEOS,
+                                           AdiabaticGLMMHDEOS>::type>("eos");
+
+  auto num_scratch_vars = nhydro + nscalars;
+
+  // Hyperbolic divergence cleaning speed for GLM MHD
+  Real c_h = 0.0;
+  if (fluid == Fluid::glmmhd) {
+    c_h = pkg->Param<Real>("c_h");
+  }
+
+  auto const &prim_in = md->PackVariables(std::vector<std::string>{"prim"});
+
+  const int scratch_level =
+      pkg->Param<int>("scratch_level"); // 0 is actual scratch (tiny); 1 is HBM
+  const int nx1 = pmb->cellbounds.ncellsi(IndexDomain::entire);
+
+  size_t scratch_size_in_bytes =
+      parthenon::ScratchPad2D<Real>::shmem_size(num_scratch_vars, nx1) * 2;
+
+  auto riemann = Riemann<fluid, rsolver>();
+
+  parthenon::par_for_outer(
+      DEFAULT_OUTER_LOOP_PATTERN, "x1 flux", DevExecSpace(), scratch_size_in_bytes,
+      scratch_level, 0, cons_in.GetDim(5) - 1, kl, ku, jl, ju,
+      KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) {
+        const auto &prim = prim_in(b);
+        auto &cons = cons_in(b);
+        parthenon::ScratchPad2D<Real> wl(member.team_scratch(scratch_level),
+                                         num_scratch_vars, nx1);
+        parthenon::ScratchPad2D<Real> wr(member.team_scratch(scratch_level),
+                                         num_scratch_vars, nx1);
+        // get reconstructed state on faces
+        Reconstruct<recon, X1DIR>(member, k, j, ib.s - 1, ib.e + 1, prim, wl, wr);
+        // Sync all threads in the team so that scratch memory is consistent
+        member.team_barrier();
+
+        riemann.Solve(member, k, j, ib.s, ib.e + 1, IV1, wl, wr, cons, eos, c_h);
+        member.team_barrier();
+
+        // Passive scalar fluxes
+        for (auto n = nhydro; n < nhydro + nscalars; ++n) {
+          parthenon::par_for_inner(member, ib.s, ib.e + 1, [&](const int i) {
+            if (cons.flux(IV1, IDN, k, j, i) >= 0.0) {
+              cons.flux(IV1, n, k, j, i) = cons.flux(IV1, IDN, k, j, i) * wl(n, i);
+            } else {
+              cons.flux(IV1, n, k, j, i) = cons.flux(IV1, IDN, k, j, i) * wr(n, i);
+            }
+          });
+        }
+      });
+
+  //--------------------------------------------------------------------------------------
+  // j-direction
+  if (pmb->pmy_mesh->ndim >= 2) {
+    scratch_size_in_bytes =
+        parthenon::ScratchPad2D<Real>::shmem_size(num_scratch_vars, nx1) * 3;
+    // set the loop limits
+    il = ib.s - 1, iu = ib.e + 1, kl = kb.s, ku = kb.e;
+    if (pmb->block_size.nx3 == 1) // 2D
+      kl = kb.s, ku = kb.e;
+    else // 3D
+      kl = kb.s - 1, ku = kb.e + 1;
+
+    parthenon::par_for_outer(
+        DEFAULT_OUTER_LOOP_PATTERN, "x2 flux", DevExecSpace(), scratch_size_in_bytes,
+        scratch_level, 0, cons_in.GetDim(5) - 1, kl, ku,
+        KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) {
+          const auto &prim = prim_in(b);
+          auto &cons = cons_in(b);
+          parthenon::ScratchPad2D<Real> wl(member.team_scratch(scratch_level),
+                                           num_scratch_vars, nx1);
+          parthenon::ScratchPad2D<Real> wr(member.team_scratch(scratch_level),
+                                           num_scratch_vars, nx1);
+          parthenon::ScratchPad2D<Real> wlb(member.team_scratch(scratch_level),
+                                            num_scratch_vars, nx1);
+          for (int j = jb.s - 1; j <= jb.e + 1; ++j) {
+            // reconstruct L/R states at j
+            Reconstruct<recon, X2DIR>(member, k, j, il, iu, prim, wlb, wr);
+            // Sync all threads in the team so that scratch memory is consistent
+            member.team_barrier();
+
+            if (j > jb.s - 1) {
+              riemann.Solve(member, k, j, il, iu, IV2, wl, wr, cons, eos, c_h);
+              member.team_barrier();
+
+              // Passive scalar fluxes
+              for (auto n = nhydro; n < nhydro + nscalars; ++n) {
+                parthenon::par_for_inner(member, il, iu, [&](const int i) {
+                  if (cons.flux(IV2, IDN, k, j, i) >= 0.0) {
+                    cons.flux(IV2, n, k, j, i) = cons.flux(IV2, IDN, k, j, i) * wl(n, i);
+                  } else {
+                    cons.flux(IV2, n, k, j, i) = cons.flux(IV2, IDN, k, j, i) * wr(n, i);
+                  }
+                });
+              }
+              member.team_barrier();
+            }
+
+            // swap the arrays for the next step
+            auto *tmp = wl.data();
+            wl.assign_data(wlb.data());
+            wlb.assign_data(tmp);
+          }
+        });
+  }
+  //--------------------------------------------------------------------------------------
+  // k-direction
+  if (pmb->pmy_mesh->ndim >= 3) {
+    // set the loop limits
+    il = ib.s - 1, iu = ib.e + 1, jl = jb.s - 1, ju = jb.e + 1;
+
+    const int tile_width = 16; //TODO(forrestglines): Magic number, set to cache line size for NVIDIA in doubles
+    const int tile_height = 8; //TODO(forrestglines): Choose tile_height based on width/NHYDRO/shared mem
+
+    constexpr const int tile_ghosts =  0*(recon == Reconstruction::dc)
+                                      +1*(recon == Reconstruction::limo3 || recon == Reconstruction::plm || recon == Reconstruction::weno3)
+                                      +2*(recon == Reconstruction::ppm || recon == Reconstruction::wenoz);
+
+    const int tile_entire_height = tile_height + tile_ghosts*2;
+
+    //TODO(forrestglines): could reduce first shmem to 1 var
+    scratch_size_in_bytes = 
+           parthenon::ScratchPad3D<Real>::shmem_size(num_scratch_vars, tile_entire_height, tile_width)
+        +2*parthenon::ScratchPad3D<Real>::shmem_size(num_scratch_vars, tile_height, tile_width);
+
+    //TODO(forrestglines):Are these number of tiles right?
+    const int n_k_tiles = static_cast<int>(ceil(static_cast<Real>(kb.e + 1 - kb.s)/tile_height));
+    const int n_i_tiles = static_cast<int>(ceil(static_cast<Real>(ib.e + 1 - ib.s)/tile_width));
+
+    parthenon::par_for_outer(
+        DEFAULT_OUTER_LOOP_PATTERN, "x3 flux", DevExecSpace(), scratch_size_in_bytes,
+        scratch_level, 0, cons_in.GetDim(5) - 1, jl, ju, 0 , n_k_tiles-1, 0, n_i_tiles-1,
+        KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int j, const int k_tile, const int i_tile) {
+          const auto &prim = prim_in(b);
+          auto &cons = cons_in(b);
+
+          //TODO(Are these ends correct?)
+          //Determine the start and end of the tile in k
+          const int k_tile_s = kb.s + tile_height*k_tile;
+          const int k_tile_e = std::min(kb.s + tile_height*(k_tile+1) -1,kb.e);
+
+          //Determine the start and end of the tile in i
+          const int i_tile_s = il + tile_width*i_tile;
+          const int i_tile_e = std::min(il + tile_width*(i_tile+1) - 1,iu);
+
+          //Load a tile of prim into scratch memory
+          parthenon::ScratchPad3D<Real> q(member.team_scratch(scratch_level),
+                                           num_scratch_vars, tile_entire_height, tile_width);
+          parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag, member, 0, num_scratch_vars-1,
+            0, tile_entire_height-1, 
+            0, tile_width-1, [&](const int n, const int k_in_tile, const int i_in_tile) {
+              const int k = k_tile_s - tile_ghosts + k_in_tile;
+              const int i = i_tile_s + i_in_tile;
+              q(n, k_in_tile, i_in_tile) = cons(n, k, j, i);
+          });
+          member.team_barrier();
+
+          //Perform reconstruction on the primitives in scratch memory
+          //FIXME: Does tileheight need to be one bigger to account for extra computation?
+          parthenon::ScratchPad3D<Real> wl(member.team_scratch(scratch_level),
+                                           num_scratch_vars, tile_height, tile_width);
+          parthenon::ScratchPad3D<Real> wr(member.team_scratch(scratch_level),
+                                           num_scratch_vars, tile_height, tile_width);
+          parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag,member,
+            0, num_scratch_vars-1,
+            0, tile_height-1,
+            0, tile_width-1, [&](const int n, const int k_in_tile, const int i_in_tile) {
+
+            Reconstruct<recon, X3DIR>(n, k_in_tile, i_in_tile, q, wl, wr);
+          });
+          member.team_barrier();
+
+          //Solve the Riemann problem
+          parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag,member,
+            0, tile_height-1, 
+            0, tile_width-1, [&](const int k_in_tile, const int i_in_tile) {
+              const int k = k_tile_s + k_in_tile;
+              const int i = i_tile_s + i_in_tile;
+
+              //Reconstruction
+              riemann.Solve(k, j, i, k_in_tile, i_in_tile, IV3, wl, wr, cons, eos, c_h);
+
+              for (auto n = nhydro; n < nhydro + nscalars; ++n) {
+                if (cons.flux(IV3, IDN, k, j, i) >= 0.0) {
+                  cons.flux(IV3, n, k, j, i) = cons.flux(IV3, IDN, k, j, i) * wl(n, k_in_tile, i_in_tile);
+                } else {
+                  cons.flux(IV3, n, k, j, i) = cons.flux(IV3, IDN, k, j, i) * wr(n, k_in_tile, i_in_tile);
+                }
+              }
+          });
+        });
+  }
+
+  const auto &conduction = pkg->Param<Conduction>("conduction");
+  if (conduction != Conduction::none) {
+    ThermalFluxAniso(md.get());
+  }
+
+  return TaskStatus::complete;
+}
+
 // Apply first order flux correction, i.e., use first order reconstruction and a
 // diffusive LLF Riemann solver if a negative density or energy density is expected.
 // The current implementation is computationally not the most efficient one, but works

diff --git a/src/hydro/rsolvers/hydro_hlle.hpp b/src/hydro/rsolvers/hydro_hlle.hpp
@@ -132,6 +132,102 @@ struct Riemann<Fluid::euler, RiemannSolver::hlle> {
       cons.flux(ivx, IEN, k, j, i) = flxi[IEN];
     });
   }
+
+  static KOKKOS_INLINE_FUNCTION void
+  Solve(const int k, const int j, const int i, 
+        const int j_in_tile, const int i_in_tile,
+        const int ivx,
+        const ScratchPad3D<Real> &wl,
+        const ScratchPad3D<Real> &wr, 
+        VariableFluxPack<Real> &cons,
+        const AdiabaticHydroEOS &eos, const Real c_h) {
+    int ivy = IV1 + ((ivx - IV1) + 1) % 3;
+    int ivz = IV1 + ((ivx - IV1) + 2) % 3;
+    Real gamma;
+    gamma = eos.GetGamma();
+    Real gm1 = gamma - 1.0;
+    Real igm1 = 1.0 / gm1;
+    Real wli[(NHYDRO)], wri[(NHYDRO)];
+    Real fl[(NHYDRO)], fr[(NHYDRO)], flxi[(NHYDRO)];
+    //--- Step 1.  Load L/R states into local variables
+    wli[IDN] = wl(IDN, j_in_tile, i_in_tile);
+    wli[IV1] = wl(ivx, j_in_tile, i_in_tile);
+    wli[IV2] = wl(ivy, j_in_tile, i_in_tile);
+    wli[IV3] = wl(ivz, j_in_tile, i_in_tile);
+    wli[IPR] = wl(IPR, j_in_tile, i_in_tile);
+
+    wri[IDN] = wr(IDN, j_in_tile, i_in_tile);
+    wri[IV1] = wr(ivx, j_in_tile, i_in_tile);
+    wri[IV2] = wr(ivy, j_in_tile, i_in_tile);
+    wri[IV3] = wr(ivz, j_in_tile, i_in_tile);
+    wri[IPR] = wr(IPR, j_in_tile, i_in_tile);
+
+    //--- Step 2.  Compute middle state estimates with PVRS (Toro 10.5.2)
+    Real al, ar, el, er;
+    Real cl = eos.SoundSpeed(wli);
+    Real cr = eos.SoundSpeed(wri);
+    el = wli[IPR] * igm1 +
+          0.5 * wli[IDN] * (SQR(wli[IV1]) + SQR(wli[IV2]) + SQR(wli[IV3]));
+    er = wri[IPR] * igm1 +
+          0.5 * wri[IDN] * (SQR(wri[IV1]) + SQR(wri[IV2]) + SQR(wri[IV3]));
+    Real rhoa = .5 * (wli[IDN] + wri[IDN]); // average density
+    Real ca = .5 * (cl + cr);               // average sound speed
+    Real pmid = .5 * (wli[IPR] + wri[IPR] + (wli[IV1] - wri[IV1]) * rhoa * ca);
+
+    //--- Step 3.  Compute sound speed in L,R
+    Real ql, qr;
+    ql = (pmid <= wli[IPR])
+              ? 1.0
+              : (1.0 + (gamma + 1) / std::sqrt(2 * gamma) * (pmid / wli[IPR] - 1.0));
+    qr = (pmid <= wri[IPR])
+              ? 1.0
+              : (1.0 + (gamma + 1) / std::sqrt(2 * gamma) * (pmid / wri[IPR] - 1.0));
+
+    //--- Step 4. Compute the max/min wave speeds based on L/R states
+
+    al = wli[IV1] - cl * ql;
+    ar = wri[IV1] + cr * qr;
+
+    Real bp = ar > 0.0 ? ar : 0.0;
+    Real bm = al < 0.0 ? al : 0.0;
+
+    //-- Step 5. Compute L/R fluxes along lines bm/bp: F_L - (S_L)U_L; F_R - (S_R)U_R
+    Real vxl = wli[IV1] - bm;
+    Real vxr = wri[IV1] - bp;
+
+    fl[IDN] = wli[IDN] * vxl;
+    fr[IDN] = wri[IDN] * vxr;
+
+    fl[IV1] = wli[IDN] * wli[IV1] * vxl;
+    fr[IV1] = wri[IDN] * wri[IV1] * vxr;
+
+    fl[IV2] = wli[IDN] * wli[IV2] * vxl;
+    fr[IV2] = wri[IDN] * wri[IV2] * vxr;
+
+    fl[IV3] = wli[IDN] * wli[IV3] * vxl;
+    fr[IV3] = wri[IDN] * wri[IV3] * vxr;
+
+    fl[IV1] += wli[IPR];
+    fr[IV1] += wri[IPR];
+    fl[IEN] = el * vxl + wli[IPR] * wli[IV1];
+    fr[IEN] = er * vxr + wri[IPR] * wri[IV1];
+
+    //--- Step 6. Compute the HLLE flux at interface.
+    Real tmp = 0.0;
+    if (bp != bm) tmp = 0.5 * (bp + bm) / (bp - bm);
+
+    flxi[IDN] = 0.5 * (fl[IDN] + fr[IDN]) + (fl[IDN] - fr[IDN]) * tmp;
+    flxi[IV1] = 0.5 * (fl[IV1] + fr[IV1]) + (fl[IV1] - fr[IV1]) * tmp;
+    flxi[IV2] = 0.5 * (fl[IV2] + fr[IV2]) + (fl[IV2] - fr[IV2]) * tmp;
+    flxi[IV3] = 0.5 * (fl[IV3] + fr[IV3]) + (fl[IV3] - fr[IV3]) * tmp;
+    flxi[IEN] = 0.5 * (fl[IEN] + fr[IEN]) + (fl[IEN] - fr[IEN]) * tmp;
+
+    cons.flux(ivx, IDN, k, j, i) = flxi[IDN];
+    cons.flux(ivx, ivx, k, j, i) = flxi[IV1];
+    cons.flux(ivx, ivy, k, j, i) = flxi[IV2];
+    cons.flux(ivx, ivz, k, j, i) = flxi[IV3];
+    cons.flux(ivx, IEN, k, j, i) = flxi[IEN];
+  }
 };
 
 #endif // RSOLVERS_HYDRO_HLLE_HPP_
diff --git a/src/recon/plm_simple.hpp b/src/recon/plm_simple.hpp
@@ -12,6 +12,7 @@
 #include <parthenon/parthenon.hpp>
 
 using parthenon::ScratchPad2D;
+using parthenon::ScratchPad3D;
 //----------------------------------------------------------------------------------------
 //! \fn PLM()
 //  \brief Reconstructs linear slope in cell i to compute ql(i+1) and qr(i). Works for
@@ -63,10 +64,31 @@ Reconstruct(parthenon::team_mbr_t const &member, const int k, const int j, const
         // ql is ql_kp1 and qr is qr_k
         PLM(q(n, k - 1, j, i), q(n, k, j, i), q(n, k + 1, j, i), ql(n, i), qr(n, i));
       } else {
-        PARTHENON_FAIL("Unknow direction for PLM reconstruction.")
+        PARTHENON_FAIL("Unknown direction for PLM reconstruction.")
       }
     });
   }
 }
 
+template <Reconstruction recon, int XNDIR>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<recon == Reconstruction::plm, void>::type
+Reconstruct(const int n, const int j, const int i, 
+      const parthenon::ScratchPad3D<Real> &q, ScratchPad3D<Real> &ql,
+            ScratchPad3D<Real> &qr) {
+  if constexpr (XNDIR == parthenon::X1DIR) {
+    // ql is ql_ip1 and qr is qr_i
+    PLM(q(n, j, i-1), q(n, j, i), q(n, j, i+1), ql(n, j, i), qr(n, j, i));
+  } else if constexpr (XNDIR == parthenon::X2DIR) {
+    // ql is ql_jp1 and qr is qr_j
+    //PLM(q(n, j - 1, i), q(n, j, i), q(n, j + 1, i), ql(n, j, i), qr(n, j, i));
+    PLM(q(n, j, i), q(n, j+1, i), q(n, j + 2, i), ql(n, j, i), qr(n, j, i));
+  } else if constexpr (XNDIR == parthenon::X3DIR) {
+    // ql is ql_kp1 and qr is qr_k
+    //FIXME Watchout for indexing in q, ql, qr -- they're different sizes!
+    PLM(q(n, j, i), q(n, j+1, i), q(n, j + 2, i), ql(n, j, i), qr(n, j, i));
+  } else {
+    PARTHENON_FAIL("Unknow direction for PLM reconstruction.")
+  }
+}
+
 #endif // RECONSTRUCT_PLM_SIMPLE_HPP_