Skip to content

Commit

Permalink
WIP: Tiling z flux for performance
Browse files Browse the repository at this point in the history
  • Loading branch information
Forrest Wolfgang Glines committed Jan 30, 2024
1 parent f334d20 commit 726f82b
Show file tree
Hide file tree
Showing 4 changed files with 351 additions and 3 deletions.
2 changes: 1 addition & 1 deletion external/parthenon
232 changes: 231 additions & 1 deletion src/hydro/hydro.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -848,7 +848,7 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshData<Real>> &md) {
parthenon::ScratchPad2D<Real>::shmem_size(num_scratch_vars, nx1) * 3;
// set the loop limits
il = ib.s - 1, iu = ib.e + 1, kl = kb.s, ku = kb.e;
if (pmb->block_size.nx3 == 1) // 2D
if (pmb->block_size.nx3 == 1) // 2D
kl = kb.s, ku = kb.e;
else // 3D
kl = kb.s - 1, ku = kb.e + 1;
Expand Down Expand Up @@ -951,6 +951,236 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshData<Real>> &md) {
return TaskStatus::complete;
}

// Calculate fluxes using scratch pad memory, i.e., over cached pencils in i-dir.
template<> //forrestglines:: Specialize for testing
TaskStatus CalculateFluxes<Fluid::euler, Reconstruction::plm, RiemannSolver::hlle>
(std::shared_ptr<MeshData<Real>> &md) {
auto const fluid = Fluid::euler;
auto const recon = Reconstruction::plm;
auto const rsolver = RiemannSolver::hlle;
auto pmb = md->GetBlockData(0)->GetBlockPointer();
IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);
int il, iu, jl, ju, kl, ku;
jl = jb.s, ju = jb.e, kl = kb.s, ku = kb.e;
// TODO(pgrete): are these looop limits are likely too large for 2nd order
if (pmb->block_size.nx2 > 1) {
if (pmb->block_size.nx3 == 1) // 2D
jl = jb.s - 1, ju = jb.e + 1, kl = kb.s, ku = kb.e;
else // 3D
jl = jb.s - 1, ju = jb.e + 1, kl = kb.s - 1, ku = kb.e + 1;
}

std::vector<parthenon::MetadataFlag> flags_ind({Metadata::Independent});
auto cons_in = md->PackVariablesAndFluxes(flags_ind);
auto pkg = pmb->packages.Get("Hydro");
const auto nhydro = pkg->Param<int>("nhydro");
const auto nscalars = pkg->Param<int>("nscalars");

const auto &eos =
pkg->Param<typename std::conditional<fluid == Fluid::euler, AdiabaticHydroEOS,
AdiabaticGLMMHDEOS>::type>("eos");

auto num_scratch_vars = nhydro + nscalars;

// Hyperbolic divergence cleaning speed for GLM MHD
Real c_h = 0.0;
if (fluid == Fluid::glmmhd) {
c_h = pkg->Param<Real>("c_h");
}

auto const &prim_in = md->PackVariables(std::vector<std::string>{"prim"});

const int scratch_level =
pkg->Param<int>("scratch_level"); // 0 is actual scratch (tiny); 1 is HBM
const int nx1 = pmb->cellbounds.ncellsi(IndexDomain::entire);

size_t scratch_size_in_bytes =
parthenon::ScratchPad2D<Real>::shmem_size(num_scratch_vars, nx1) * 2;

auto riemann = Riemann<fluid, rsolver>();

parthenon::par_for_outer(
DEFAULT_OUTER_LOOP_PATTERN, "x1 flux", DevExecSpace(), scratch_size_in_bytes,
scratch_level, 0, cons_in.GetDim(5) - 1, kl, ku, jl, ju,
KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) {
const auto &prim = prim_in(b);
auto &cons = cons_in(b);
parthenon::ScratchPad2D<Real> wl(member.team_scratch(scratch_level),
num_scratch_vars, nx1);
parthenon::ScratchPad2D<Real> wr(member.team_scratch(scratch_level),
num_scratch_vars, nx1);
// get reconstructed state on faces
Reconstruct<recon, X1DIR>(member, k, j, ib.s - 1, ib.e + 1, prim, wl, wr);
// Sync all threads in the team so that scratch memory is consistent
member.team_barrier();

riemann.Solve(member, k, j, ib.s, ib.e + 1, IV1, wl, wr, cons, eos, c_h);
member.team_barrier();

// Passive scalar fluxes
for (auto n = nhydro; n < nhydro + nscalars; ++n) {
parthenon::par_for_inner(member, ib.s, ib.e + 1, [&](const int i) {
if (cons.flux(IV1, IDN, k, j, i) >= 0.0) {
cons.flux(IV1, n, k, j, i) = cons.flux(IV1, IDN, k, j, i) * wl(n, i);
} else {
cons.flux(IV1, n, k, j, i) = cons.flux(IV1, IDN, k, j, i) * wr(n, i);
}
});
}
});

//--------------------------------------------------------------------------------------
// j-direction
if (pmb->pmy_mesh->ndim >= 2) {
scratch_size_in_bytes =
parthenon::ScratchPad2D<Real>::shmem_size(num_scratch_vars, nx1) * 3;
// set the loop limits
il = ib.s - 1, iu = ib.e + 1, kl = kb.s, ku = kb.e;
if (pmb->block_size.nx3 == 1) // 2D
kl = kb.s, ku = kb.e;
else // 3D
kl = kb.s - 1, ku = kb.e + 1;

parthenon::par_for_outer(
DEFAULT_OUTER_LOOP_PATTERN, "x2 flux", DevExecSpace(), scratch_size_in_bytes,
scratch_level, 0, cons_in.GetDim(5) - 1, kl, ku,
KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) {
const auto &prim = prim_in(b);
auto &cons = cons_in(b);
parthenon::ScratchPad2D<Real> wl(member.team_scratch(scratch_level),
num_scratch_vars, nx1);
parthenon::ScratchPad2D<Real> wr(member.team_scratch(scratch_level),
num_scratch_vars, nx1);
parthenon::ScratchPad2D<Real> wlb(member.team_scratch(scratch_level),
num_scratch_vars, nx1);
for (int j = jb.s - 1; j <= jb.e + 1; ++j) {
// reconstruct L/R states at j
Reconstruct<recon, X2DIR>(member, k, j, il, iu, prim, wlb, wr);
// Sync all threads in the team so that scratch memory is consistent
member.team_barrier();

if (j > jb.s - 1) {
riemann.Solve(member, k, j, il, iu, IV2, wl, wr, cons, eos, c_h);
member.team_barrier();

// Passive scalar fluxes
for (auto n = nhydro; n < nhydro + nscalars; ++n) {
parthenon::par_for_inner(member, il, iu, [&](const int i) {
if (cons.flux(IV2, IDN, k, j, i) >= 0.0) {
cons.flux(IV2, n, k, j, i) = cons.flux(IV2, IDN, k, j, i) * wl(n, i);
} else {
cons.flux(IV2, n, k, j, i) = cons.flux(IV2, IDN, k, j, i) * wr(n, i);
}
});
}
member.team_barrier();
}

// swap the arrays for the next step
auto *tmp = wl.data();
wl.assign_data(wlb.data());
wlb.assign_data(tmp);
}
});
}
//--------------------------------------------------------------------------------------
// k-direction
if (pmb->pmy_mesh->ndim >= 3) {
// set the loop limits
il = ib.s - 1, iu = ib.e + 1, jl = jb.s - 1, ju = jb.e + 1;

const int tile_width = 16; //TODO(forrestglines): Magic number, set to cache line size for NVIDIA in doubles
const int tile_height = 8; //TODO(forrestglines): Choose tile_height based on width/NHYDRO/shared mem

constexpr const int tile_ghosts = 0*(recon == Reconstruction::dc)
+1*(recon == Reconstruction::limo3 || recon == Reconstruction::plm || recon == Reconstruction::weno3)
+2*(recon == Reconstruction::ppm || recon == Reconstruction::wenoz);

const int tile_entire_height = tile_height + tile_ghosts*2;

//TODO(forrestglines): could reduce first shmem to 1 var
scratch_size_in_bytes =
parthenon::ScratchPad3D<Real>::shmem_size(num_scratch_vars, tile_entire_height, tile_width)
+2*parthenon::ScratchPad3D<Real>::shmem_size(num_scratch_vars, tile_height, tile_width);

//TODO(forrestglines):Are these number of tiles right?
const int n_k_tiles = static_cast<int>(ceil(static_cast<Real>(kb.e + 1 - kb.s)/tile_height));
const int n_i_tiles = static_cast<int>(ceil(static_cast<Real>(ib.e + 1 - ib.s)/tile_width));

parthenon::par_for_outer(
DEFAULT_OUTER_LOOP_PATTERN, "x3 flux", DevExecSpace(), scratch_size_in_bytes,
scratch_level, 0, cons_in.GetDim(5) - 1, jl, ju, 0 , n_k_tiles-1, 0, n_i_tiles-1,
KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int j, const int k_tile, const int i_tile) {
const auto &prim = prim_in(b);
auto &cons = cons_in(b);

//TODO(Are these ends correct?)
//Determine the start and end of the tile in k
const int k_tile_s = kb.s + tile_height*k_tile;
const int k_tile_e = std::min(kb.s + tile_height*(k_tile+1) -1,kb.e);

//Determine the start and end of the tile in i
const int i_tile_s = il + tile_width*i_tile;
const int i_tile_e = std::min(il + tile_width*(i_tile+1) - 1,iu);

//Load a tile of prim into scratch memory
parthenon::ScratchPad3D<Real> q(member.team_scratch(scratch_level),
num_scratch_vars, tile_entire_height, tile_width);
parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag, member, 0, num_scratch_vars-1,
0, tile_entire_height-1,
0, tile_width-1, [&](const int n, const int k_in_tile, const int i_in_tile) {
const int k = k_tile_s - tile_ghosts + k_in_tile;
const int i = i_tile_s + i_in_tile;
q(n, k_in_tile, i_in_tile) = cons(n, k, j, i);
});
member.team_barrier();

//Perform reconstruction on the primitives in scratch memory
//FIXME: Does tileheight need to be one bigger to account for extra computation?
parthenon::ScratchPad3D<Real> wl(member.team_scratch(scratch_level),
num_scratch_vars, tile_height, tile_width);
parthenon::ScratchPad3D<Real> wr(member.team_scratch(scratch_level),
num_scratch_vars, tile_height, tile_width);
parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag,member,
0, num_scratch_vars-1,
0, tile_height-1,
0, tile_width-1, [&](const int n, const int k_in_tile, const int i_in_tile) {

Reconstruct<recon, X3DIR>(n, k_in_tile, i_in_tile, q, wl, wr);
});
member.team_barrier();

//Solve the Riemann problem
parthenon::par_for_inner(parthenon::inner_loop_pattern_ttr_tag,member,
0, tile_height-1,
0, tile_width-1, [&](const int k_in_tile, const int i_in_tile) {
const int k = k_tile_s + k_in_tile;
const int i = i_tile_s + i_in_tile;

//Reconstruction
riemann.Solve(k, j, i, k_in_tile, i_in_tile, IV3, wl, wr, cons, eos, c_h);

for (auto n = nhydro; n < nhydro + nscalars; ++n) {
if (cons.flux(IV3, IDN, k, j, i) >= 0.0) {
cons.flux(IV3, n, k, j, i) = cons.flux(IV3, IDN, k, j, i) * wl(n, k_in_tile, i_in_tile);
} else {
cons.flux(IV3, n, k, j, i) = cons.flux(IV3, IDN, k, j, i) * wr(n, k_in_tile, i_in_tile);
}
}
});
});
}

const auto &conduction = pkg->Param<Conduction>("conduction");
if (conduction != Conduction::none) {
ThermalFluxAniso(md.get());
}

return TaskStatus::complete;
}

// Apply first order flux correction, i.e., use first order reconstruction and a
// diffusive LLF Riemann solver if a negative density or energy density is expected.
// The current implementation is computationally not the most efficient one, but works
Expand Down
96 changes: 96 additions & 0 deletions src/hydro/rsolvers/hydro_hlle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,102 @@ struct Riemann<Fluid::euler, RiemannSolver::hlle> {
cons.flux(ivx, IEN, k, j, i) = flxi[IEN];
});
}

static KOKKOS_INLINE_FUNCTION void
Solve(const int k, const int j, const int i,
const int j_in_tile, const int i_in_tile,
const int ivx,
const ScratchPad3D<Real> &wl,
const ScratchPad3D<Real> &wr,
VariableFluxPack<Real> &cons,
const AdiabaticHydroEOS &eos, const Real c_h) {
int ivy = IV1 + ((ivx - IV1) + 1) % 3;
int ivz = IV1 + ((ivx - IV1) + 2) % 3;
Real gamma;
gamma = eos.GetGamma();
Real gm1 = gamma - 1.0;
Real igm1 = 1.0 / gm1;
Real wli[(NHYDRO)], wri[(NHYDRO)];
Real fl[(NHYDRO)], fr[(NHYDRO)], flxi[(NHYDRO)];
//--- Step 1. Load L/R states into local variables
wli[IDN] = wl(IDN, j_in_tile, i_in_tile);
wli[IV1] = wl(ivx, j_in_tile, i_in_tile);
wli[IV2] = wl(ivy, j_in_tile, i_in_tile);
wli[IV3] = wl(ivz, j_in_tile, i_in_tile);
wli[IPR] = wl(IPR, j_in_tile, i_in_tile);

wri[IDN] = wr(IDN, j_in_tile, i_in_tile);
wri[IV1] = wr(ivx, j_in_tile, i_in_tile);
wri[IV2] = wr(ivy, j_in_tile, i_in_tile);
wri[IV3] = wr(ivz, j_in_tile, i_in_tile);
wri[IPR] = wr(IPR, j_in_tile, i_in_tile);

//--- Step 2. Compute middle state estimates with PVRS (Toro 10.5.2)
Real al, ar, el, er;
Real cl = eos.SoundSpeed(wli);
Real cr = eos.SoundSpeed(wri);
el = wli[IPR] * igm1 +
0.5 * wli[IDN] * (SQR(wli[IV1]) + SQR(wli[IV2]) + SQR(wli[IV3]));
er = wri[IPR] * igm1 +
0.5 * wri[IDN] * (SQR(wri[IV1]) + SQR(wri[IV2]) + SQR(wri[IV3]));
Real rhoa = .5 * (wli[IDN] + wri[IDN]); // average density
Real ca = .5 * (cl + cr); // average sound speed
Real pmid = .5 * (wli[IPR] + wri[IPR] + (wli[IV1] - wri[IV1]) * rhoa * ca);

//--- Step 3. Compute sound speed in L,R
Real ql, qr;
ql = (pmid <= wli[IPR])
? 1.0
: (1.0 + (gamma + 1) / std::sqrt(2 * gamma) * (pmid / wli[IPR] - 1.0));
qr = (pmid <= wri[IPR])
? 1.0
: (1.0 + (gamma + 1) / std::sqrt(2 * gamma) * (pmid / wri[IPR] - 1.0));

//--- Step 4. Compute the max/min wave speeds based on L/R states

al = wli[IV1] - cl * ql;
ar = wri[IV1] + cr * qr;

Real bp = ar > 0.0 ? ar : 0.0;
Real bm = al < 0.0 ? al : 0.0;

//-- Step 5. Compute L/R fluxes along lines bm/bp: F_L - (S_L)U_L; F_R - (S_R)U_R
Real vxl = wli[IV1] - bm;
Real vxr = wri[IV1] - bp;

fl[IDN] = wli[IDN] * vxl;
fr[IDN] = wri[IDN] * vxr;

fl[IV1] = wli[IDN] * wli[IV1] * vxl;
fr[IV1] = wri[IDN] * wri[IV1] * vxr;

fl[IV2] = wli[IDN] * wli[IV2] * vxl;
fr[IV2] = wri[IDN] * wri[IV2] * vxr;

fl[IV3] = wli[IDN] * wli[IV3] * vxl;
fr[IV3] = wri[IDN] * wri[IV3] * vxr;

fl[IV1] += wli[IPR];
fr[IV1] += wri[IPR];
fl[IEN] = el * vxl + wli[IPR] * wli[IV1];
fr[IEN] = er * vxr + wri[IPR] * wri[IV1];

//--- Step 6. Compute the HLLE flux at interface.
Real tmp = 0.0;
if (bp != bm) tmp = 0.5 * (bp + bm) / (bp - bm);

flxi[IDN] = 0.5 * (fl[IDN] + fr[IDN]) + (fl[IDN] - fr[IDN]) * tmp;
flxi[IV1] = 0.5 * (fl[IV1] + fr[IV1]) + (fl[IV1] - fr[IV1]) * tmp;
flxi[IV2] = 0.5 * (fl[IV2] + fr[IV2]) + (fl[IV2] - fr[IV2]) * tmp;
flxi[IV3] = 0.5 * (fl[IV3] + fr[IV3]) + (fl[IV3] - fr[IV3]) * tmp;
flxi[IEN] = 0.5 * (fl[IEN] + fr[IEN]) + (fl[IEN] - fr[IEN]) * tmp;

cons.flux(ivx, IDN, k, j, i) = flxi[IDN];
cons.flux(ivx, ivx, k, j, i) = flxi[IV1];
cons.flux(ivx, ivy, k, j, i) = flxi[IV2];
cons.flux(ivx, ivz, k, j, i) = flxi[IV3];
cons.flux(ivx, IEN, k, j, i) = flxi[IEN];
}
};

#endif // RSOLVERS_HYDRO_HLLE_HPP_
24 changes: 23 additions & 1 deletion src/recon/plm_simple.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <parthenon/parthenon.hpp>

using parthenon::ScratchPad2D;
using parthenon::ScratchPad3D;
//----------------------------------------------------------------------------------------
//! \fn PLM()
// \brief Reconstructs linear slope in cell i to compute ql(i+1) and qr(i). Works for
Expand Down Expand Up @@ -63,10 +64,31 @@ Reconstruct(parthenon::team_mbr_t const &member, const int k, const int j, const
// ql is ql_kp1 and qr is qr_k
PLM(q(n, k - 1, j, i), q(n, k, j, i), q(n, k + 1, j, i), ql(n, i), qr(n, i));
} else {
PARTHENON_FAIL("Unknow direction for PLM reconstruction.")
PARTHENON_FAIL("Unknown direction for PLM reconstruction.")
}
});
}
}

template <Reconstruction recon, int XNDIR>
KOKKOS_INLINE_FUNCTION typename std::enable_if<recon == Reconstruction::plm, void>::type
Reconstruct(const int n, const int j, const int i,
const parthenon::ScratchPad3D<Real> &q, ScratchPad3D<Real> &ql,
ScratchPad3D<Real> &qr) {
if constexpr (XNDIR == parthenon::X1DIR) {
// ql is ql_ip1 and qr is qr_i
PLM(q(n, j, i-1), q(n, j, i), q(n, j, i+1), ql(n, j, i), qr(n, j, i));
} else if constexpr (XNDIR == parthenon::X2DIR) {
// ql is ql_jp1 and qr is qr_j
//PLM(q(n, j - 1, i), q(n, j, i), q(n, j + 1, i), ql(n, j, i), qr(n, j, i));
PLM(q(n, j, i), q(n, j+1, i), q(n, j + 2, i), ql(n, j, i), qr(n, j, i));
} else if constexpr (XNDIR == parthenon::X3DIR) {
// ql is ql_kp1 and qr is qr_k
//FIXME Watchout for indexing in q, ql, qr -- they're different sizes!
PLM(q(n, j, i), q(n, j+1, i), q(n, j + 2, i), ql(n, j, i), qr(n, j, i));
} else {
PARTHENON_FAIL("Unknow direction for PLM reconstruction.")
}
}

#endif // RECONSTRUCT_PLM_SIMPLE_HPP_

0 comments on commit 726f82b

Please sign in to comment.