diff --git a/external/parthenon b/external/parthenon index 5b6bb619..80b2c10b 160000 --- a/external/parthenon +++ b/external/parthenon @@ -1 +1 @@ -Subproject commit 5b6bb61906f7c278f9724ee9f38e79dee8707098 +Subproject commit 80b2c10b6c3f1bda31f174f8d13564536b235347 diff --git a/src/hydro/hydro.cpp b/src/hydro/hydro.cpp index 6d0e087c..46c18a30 100644 --- a/src/hydro/hydro.cpp +++ b/src/hydro/hydro.cpp @@ -848,7 +848,7 @@ TaskStatus CalculateFluxes(std::shared_ptr> &md) { parthenon::ScratchPad2D::shmem_size(num_scratch_vars, nx1) * 3; // set the loop limits il = ib.s - 1, iu = ib.e + 1, kl = kb.s, ku = kb.e; - if (pmb->block_size.nx3 == 1) // 2D + if (pmb->block_size.nx3 == 1) // 2D kl = kb.s, ku = kb.e; else // 3D kl = kb.s - 1, ku = kb.e + 1; @@ -951,6 +951,236 @@ TaskStatus CalculateFluxes(std::shared_ptr> &md) { return TaskStatus::complete; } +// Calculate fluxes using scratch pad memory, i.e., over cached pencils in i-dir. +template<> //forrestglines:: Specialize for testing +TaskStatus CalculateFluxes + (std::shared_ptr> &md) { + auto const fluid = Fluid::euler; + auto const recon = Reconstruction::plm; + auto const rsolver = RiemannSolver::hlle; + auto pmb = md->GetBlockData(0)->GetBlockPointer(); + IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); + IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior); + IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior); + int il, iu, jl, ju, kl, ku; + jl = jb.s, ju = jb.e, kl = kb.s, ku = kb.e; + // TODO(pgrete): are these looop limits are likely too large for 2nd order + if (pmb->block_size.nx2 > 1) { + if (pmb->block_size.nx3 == 1) // 2D + jl = jb.s - 1, ju = jb.e + 1, kl = kb.s, ku = kb.e; + else // 3D + jl = jb.s - 1, ju = jb.e + 1, kl = kb.s - 1, ku = kb.e + 1; + } + + std::vector flags_ind({Metadata::Independent}); + auto cons_in = md->PackVariablesAndFluxes(flags_ind); + auto pkg = pmb->packages.Get("Hydro"); + const auto nhydro = pkg->Param("nhydro"); + const auto nscalars = pkg->Param("nscalars"); + + const auto &eos = + pkg->Param::type>("eos"); + + auto num_scratch_vars = nhydro + nscalars; + + // Hyperbolic divergence cleaning speed for GLM MHD + Real c_h = 0.0; + if (fluid == Fluid::glmmhd) { + c_h = pkg->Param("c_h"); + } + + auto const &prim_in = md->PackVariables(std::vector{"prim"}); + + const int scratch_level = + pkg->Param("scratch_level"); // 0 is actual scratch (tiny); 1 is HBM + const int nx1 = pmb->cellbounds.ncellsi(IndexDomain::entire); + + size_t scratch_size_in_bytes = + parthenon::ScratchPad2D::shmem_size(num_scratch_vars, nx1) * 2; + + auto riemann = Riemann(); + + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "x1 flux", DevExecSpace(), scratch_size_in_bytes, + scratch_level, 0, cons_in.GetDim(5) - 1, kl, ku, jl, ju, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) { + const auto &prim = prim_in(b); + auto &cons = cons_in(b); + parthenon::ScratchPad2D wl(member.team_scratch(scratch_level), + num_scratch_vars, nx1); + parthenon::ScratchPad2D wr(member.team_scratch(scratch_level), + num_scratch_vars, nx1); + // get reconstructed state on faces + Reconstruct(member, k, j, ib.s - 1, ib.e + 1, prim, wl, wr); + // Sync all threads in the team so that scratch memory is consistent + member.team_barrier(); + + riemann.Solve(member, k, j, ib.s, ib.e + 1, IV1, wl, wr, cons, eos, c_h); + member.team_barrier(); + + // Passive scalar fluxes + for (auto n = nhydro; n < nhydro + nscalars; ++n) { + parthenon::par_for_inner(member, ib.s, ib.e + 1, [&](const int i) { + if (cons.flux(IV1, IDN, k, j, i) >= 0.0) { + cons.flux(IV1, n, k, j, i) = cons.flux(IV1, IDN, k, j, i) * wl(n, i); + } else { + cons.flux(IV1, n, k, j, i) = cons.flux(IV1, IDN, k, j, i) * wr(n, i); + } + }); + } + }); + + //-------------------------------------------------------------------------------------- + // j-direction + if (pmb->pmy_mesh->ndim >= 2) { + scratch_size_in_bytes = + parthenon::ScratchPad2D::shmem_size(num_scratch_vars, nx1) * 3; + // set the loop limits + il = ib.s - 1, iu = ib.e + 1, kl = kb.s, ku = kb.e; + if (pmb->block_size.nx3 == 1) // 2D + kl = kb.s, ku = kb.e; + else // 3D + kl = kb.s - 1, ku = kb.e + 1; + + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "x2 flux", DevExecSpace(), scratch_size_in_bytes, + scratch_level, 0, cons_in.GetDim(5) - 1, kl, ku, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + const auto &prim = prim_in(b); + auto &cons = cons_in(b); + parthenon::ScratchPad2D wl(member.team_scratch(scratch_level), + num_scratch_vars, nx1); + parthenon::ScratchPad2D wr(member.team_scratch(scratch_level), + num_scratch_vars, nx1); + parthenon::ScratchPad2D wlb(member.team_scratch(scratch_level), + num_scratch_vars, nx1); + for (int j = jb.s - 1; j <= jb.e + 1; ++j) { + // reconstruct L/R states at j + Reconstruct(member, k, j, il, iu, prim, wlb, wr); + // Sync all threads in the team so that scratch memory is consistent + member.team_barrier(); + + if (j > jb.s - 1) { + riemann.Solve(member, k, j, il, iu, IV2, wl, wr, cons, eos, c_h); + member.team_barrier(); + + // Passive scalar fluxes + for (auto n = nhydro; n < nhydro + nscalars; ++n) { + parthenon::par_for_inner(member, il, iu, [&](const int i) { + if (cons.flux(IV2, IDN, k, j, i) >= 0.0) { + cons.flux(IV2, n, k, j, i) = cons.flux(IV2, IDN, k, j, i) * wl(n, i); + } else { + cons.flux(IV2, n, k, j, i) = cons.flux(IV2, IDN, k, j, i) * wr(n, i); + } + }); + } + member.team_barrier(); + } + + // swap the arrays for the next step + auto *tmp = wl.data(); + wl.assign_data(wlb.data()); + wlb.assign_data(tmp); + } + }); + } + //-------------------------------------------------------------------------------------- + // k-direction + if (pmb->pmy_mesh->ndim >= 3) { + // set the loop limits + il = ib.s - 1, iu = ib.e + 1, jl = jb.s - 1, ju = jb.e + 1; + + const int tile_width = 16; //TODO(forrestglines): Magic number, set to cache line size for NVIDIA in doubles + const int tile_height = 8; //TODO(forrestglines): Choose tile_height based on width/NHYDRO/shared mem + + constexpr const int tile_ghosts = 0*(recon == Reconstruction::dc) + +1*(recon == Reconstruction::limo3 || recon == Reconstruction::plm || recon == Reconstruction::weno3) + +2*(recon == Reconstruction::ppm || recon == Reconstruction::wenoz); + + const int tile_entire_height = tile_height + tile_ghosts*2; + + //TODO(forrestglines): could reduce first shmem to 1 var + scratch_size_in_bytes = + parthenon::ScratchPad3D::shmem_size(num_scratch_vars, tile_entire_height, tile_width) + +2*parthenon::ScratchPad3D::shmem_size(num_scratch_vars, tile_height, tile_width); + + //TODO(forrestglines):Are these number of tiles right? + const int n_k_tiles = static_cast(ceil(static_cast(kb.e + 1 - kb.s)/tile_height)); + const int n_i_tiles = static_cast(ceil(static_cast(ib.e + 1 - ib.s)/tile_width)); + + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "x3 flux", DevExecSpace(), scratch_size_in_bytes, + scratch_level, 0, cons_in.GetDim(5) - 1, jl, ju, 0 , n_k_tiles-1, 0, n_i_tiles-1, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int j, const int k_tile, const int i_tile) { + const auto &prim = prim_in(b); + auto &cons = cons_in(b); + + //TODO(Are these ends correct?) + //Determine the start and end of the tile in k + const int k_tile_s = kb.s + tile_height*k_tile; + const int k_tile_e = std::min(kb.s + tile_height*(k_tile+1) -1,kb.e); + + //Determine the start and end of the tile in i + const int i_tile_s = il + tile_width*i_tile; + const int i_tile_e = std::min(il + tile_width*(i_tile+1) - 1,iu); + + //Load a tile of prim into scratch memory + parthenon::ScratchPad3D q(member.team_scratch(scratch_level), + num_scratch_vars, tile_entire_height, tile_width); + parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag, member, 0, num_scratch_vars-1, + 0, tile_entire_height-1, + 0, tile_width-1, [&](const int n, const int k_in_tile, const int i_in_tile) { + const int k = k_tile_s - tile_ghosts + k_in_tile; + const int i = i_tile_s + i_in_tile; + q(n, k_in_tile, i_in_tile) = cons(n, k, j, i); + }); + member.team_barrier(); + + //Perform reconstruction on the primitives in scratch memory + //FIXME: Does tileheight need to be one bigger to account for extra computation? + parthenon::ScratchPad3D wl(member.team_scratch(scratch_level), + num_scratch_vars, tile_height, tile_width); + parthenon::ScratchPad3D wr(member.team_scratch(scratch_level), + num_scratch_vars, tile_height, tile_width); + parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag,member, + 0, num_scratch_vars-1, + 0, tile_height-1, + 0, tile_width-1, [&](const int n, const int k_in_tile, const int i_in_tile) { + + Reconstruct(n, k_in_tile, i_in_tile, q, wl, wr); + }); + member.team_barrier(); + + //Solve the Riemann problem + parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag,member, + 0, tile_height-1, + 0, tile_width-1, [&](const int k_in_tile, const int i_in_tile) { + const int k = k_tile_s + k_in_tile; + const int i = i_tile_s + i_in_tile; + + //Reconstruction + riemann.Solve(k, j, i, k_in_tile, i_in_tile, IV3, wl, wr, cons, eos, c_h); + + for (auto n = nhydro; n < nhydro + nscalars; ++n) { + if (cons.flux(IV3, IDN, k, j, i) >= 0.0) { + cons.flux(IV3, n, k, j, i) = cons.flux(IV3, IDN, k, j, i) * wl(n, k_in_tile, i_in_tile); + } else { + cons.flux(IV3, n, k, j, i) = cons.flux(IV3, IDN, k, j, i) * wr(n, k_in_tile, i_in_tile); + } + } + }); + }); + } + + const auto &conduction = pkg->Param("conduction"); + if (conduction != Conduction::none) { + ThermalFluxAniso(md.get()); + } + + return TaskStatus::complete; +} + // Apply first order flux correction, i.e., use first order reconstruction and a // diffusive LLF Riemann solver if a negative density or energy density is expected. // The current implementation is computationally not the most efficient one, but works diff --git a/src/hydro/rsolvers/hydro_hlle.hpp b/src/hydro/rsolvers/hydro_hlle.hpp index 814acc63..893142be 100644 --- a/src/hydro/rsolvers/hydro_hlle.hpp +++ b/src/hydro/rsolvers/hydro_hlle.hpp @@ -132,6 +132,102 @@ struct Riemann { cons.flux(ivx, IEN, k, j, i) = flxi[IEN]; }); } + + static KOKKOS_INLINE_FUNCTION void + Solve(const int k, const int j, const int i, + const int j_in_tile, const int i_in_tile, + const int ivx, + const ScratchPad3D &wl, + const ScratchPad3D &wr, + VariableFluxPack &cons, + const AdiabaticHydroEOS &eos, const Real c_h) { + int ivy = IV1 + ((ivx - IV1) + 1) % 3; + int ivz = IV1 + ((ivx - IV1) + 2) % 3; + Real gamma; + gamma = eos.GetGamma(); + Real gm1 = gamma - 1.0; + Real igm1 = 1.0 / gm1; + Real wli[(NHYDRO)], wri[(NHYDRO)]; + Real fl[(NHYDRO)], fr[(NHYDRO)], flxi[(NHYDRO)]; + //--- Step 1. Load L/R states into local variables + wli[IDN] = wl(IDN, j_in_tile, i_in_tile); + wli[IV1] = wl(ivx, j_in_tile, i_in_tile); + wli[IV2] = wl(ivy, j_in_tile, i_in_tile); + wli[IV3] = wl(ivz, j_in_tile, i_in_tile); + wli[IPR] = wl(IPR, j_in_tile, i_in_tile); + + wri[IDN] = wr(IDN, j_in_tile, i_in_tile); + wri[IV1] = wr(ivx, j_in_tile, i_in_tile); + wri[IV2] = wr(ivy, j_in_tile, i_in_tile); + wri[IV3] = wr(ivz, j_in_tile, i_in_tile); + wri[IPR] = wr(IPR, j_in_tile, i_in_tile); + + //--- Step 2. Compute middle state estimates with PVRS (Toro 10.5.2) + Real al, ar, el, er; + Real cl = eos.SoundSpeed(wli); + Real cr = eos.SoundSpeed(wri); + el = wli[IPR] * igm1 + + 0.5 * wli[IDN] * (SQR(wli[IV1]) + SQR(wli[IV2]) + SQR(wli[IV3])); + er = wri[IPR] * igm1 + + 0.5 * wri[IDN] * (SQR(wri[IV1]) + SQR(wri[IV2]) + SQR(wri[IV3])); + Real rhoa = .5 * (wli[IDN] + wri[IDN]); // average density + Real ca = .5 * (cl + cr); // average sound speed + Real pmid = .5 * (wli[IPR] + wri[IPR] + (wli[IV1] - wri[IV1]) * rhoa * ca); + + //--- Step 3. Compute sound speed in L,R + Real ql, qr; + ql = (pmid <= wli[IPR]) + ? 1.0 + : (1.0 + (gamma + 1) / std::sqrt(2 * gamma) * (pmid / wli[IPR] - 1.0)); + qr = (pmid <= wri[IPR]) + ? 1.0 + : (1.0 + (gamma + 1) / std::sqrt(2 * gamma) * (pmid / wri[IPR] - 1.0)); + + //--- Step 4. Compute the max/min wave speeds based on L/R states + + al = wli[IV1] - cl * ql; + ar = wri[IV1] + cr * qr; + + Real bp = ar > 0.0 ? ar : 0.0; + Real bm = al < 0.0 ? al : 0.0; + + //-- Step 5. Compute L/R fluxes along lines bm/bp: F_L - (S_L)U_L; F_R - (S_R)U_R + Real vxl = wli[IV1] - bm; + Real vxr = wri[IV1] - bp; + + fl[IDN] = wli[IDN] * vxl; + fr[IDN] = wri[IDN] * vxr; + + fl[IV1] = wli[IDN] * wli[IV1] * vxl; + fr[IV1] = wri[IDN] * wri[IV1] * vxr; + + fl[IV2] = wli[IDN] * wli[IV2] * vxl; + fr[IV2] = wri[IDN] * wri[IV2] * vxr; + + fl[IV3] = wli[IDN] * wli[IV3] * vxl; + fr[IV3] = wri[IDN] * wri[IV3] * vxr; + + fl[IV1] += wli[IPR]; + fr[IV1] += wri[IPR]; + fl[IEN] = el * vxl + wli[IPR] * wli[IV1]; + fr[IEN] = er * vxr + wri[IPR] * wri[IV1]; + + //--- Step 6. Compute the HLLE flux at interface. + Real tmp = 0.0; + if (bp != bm) tmp = 0.5 * (bp + bm) / (bp - bm); + + flxi[IDN] = 0.5 * (fl[IDN] + fr[IDN]) + (fl[IDN] - fr[IDN]) * tmp; + flxi[IV1] = 0.5 * (fl[IV1] + fr[IV1]) + (fl[IV1] - fr[IV1]) * tmp; + flxi[IV2] = 0.5 * (fl[IV2] + fr[IV2]) + (fl[IV2] - fr[IV2]) * tmp; + flxi[IV3] = 0.5 * (fl[IV3] + fr[IV3]) + (fl[IV3] - fr[IV3]) * tmp; + flxi[IEN] = 0.5 * (fl[IEN] + fr[IEN]) + (fl[IEN] - fr[IEN]) * tmp; + + cons.flux(ivx, IDN, k, j, i) = flxi[IDN]; + cons.flux(ivx, ivx, k, j, i) = flxi[IV1]; + cons.flux(ivx, ivy, k, j, i) = flxi[IV2]; + cons.flux(ivx, ivz, k, j, i) = flxi[IV3]; + cons.flux(ivx, IEN, k, j, i) = flxi[IEN]; + } }; #endif // RSOLVERS_HYDRO_HLLE_HPP_ diff --git a/src/recon/plm_simple.hpp b/src/recon/plm_simple.hpp index 45f4c720..19795b7a 100644 --- a/src/recon/plm_simple.hpp +++ b/src/recon/plm_simple.hpp @@ -12,6 +12,7 @@ #include using parthenon::ScratchPad2D; +using parthenon::ScratchPad3D; //---------------------------------------------------------------------------------------- //! \fn PLM() // \brief Reconstructs linear slope in cell i to compute ql(i+1) and qr(i). Works for @@ -63,10 +64,31 @@ Reconstruct(parthenon::team_mbr_t const &member, const int k, const int j, const // ql is ql_kp1 and qr is qr_k PLM(q(n, k - 1, j, i), q(n, k, j, i), q(n, k + 1, j, i), ql(n, i), qr(n, i)); } else { - PARTHENON_FAIL("Unknow direction for PLM reconstruction.") + PARTHENON_FAIL("Unknown direction for PLM reconstruction.") } }); } } +template +KOKKOS_INLINE_FUNCTION typename std::enable_if::type +Reconstruct(const int n, const int j, const int i, + const parthenon::ScratchPad3D &q, ScratchPad3D &ql, + ScratchPad3D &qr) { + if constexpr (XNDIR == parthenon::X1DIR) { + // ql is ql_ip1 and qr is qr_i + PLM(q(n, j, i-1), q(n, j, i), q(n, j, i+1), ql(n, j, i), qr(n, j, i)); + } else if constexpr (XNDIR == parthenon::X2DIR) { + // ql is ql_jp1 and qr is qr_j + //PLM(q(n, j - 1, i), q(n, j, i), q(n, j + 1, i), ql(n, j, i), qr(n, j, i)); + PLM(q(n, j, i), q(n, j+1, i), q(n, j + 2, i), ql(n, j, i), qr(n, j, i)); + } else if constexpr (XNDIR == parthenon::X3DIR) { + // ql is ql_kp1 and qr is qr_k + //FIXME Watchout for indexing in q, ql, qr -- they're different sizes! + PLM(q(n, j, i), q(n, j+1, i), q(n, j + 2, i), ql(n, j, i), qr(n, j, i)); + } else { + PARTHENON_FAIL("Unknow direction for PLM reconstruction.") + } +} + #endif // RECONSTRUCT_PLM_SIMPLE_HPP_