Skip to content

Commit

Permalink
Fix event consumption counting for CUDA graphs
Browse files Browse the repository at this point in the history
Previously, debug build runs with CUDA graphs were crashing due to
event consumption counting issues associated with re-marking of
events. This change avoids event remarking for PME and PP on same
rank, and allows the recessary coordinate availability remarking to
allow graphs to be defined and launched through
setConsumptionLimits().
  • Loading branch information
agray3 authored and lundborgmagnus committed Nov 7, 2023
1 parent 69085a9 commit 9bf709b
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 12 deletions.
8 changes: 4 additions & 4 deletions src/gromacs/ewald/pme.h
Original file line number Diff line number Diff line change
Expand Up @@ -496,12 +496,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_wait_and_reduce(gmx_pme_t* GPU_FUN
* \todo Rename this function to *clear* -- it clearly only does output resetting
* and we should be clear about what the function does..
*
* \param[in] pme The PME data structure.
* \param[in] useMdGpuGraph Whether MD GPU Graph is in use.
* \param[in] wcycle The wallclock counter.
* \param[in] pme The PME data structure.
* \param[in] gpuGraphWithSeparatePmeRank Whether MD GPU Graph with separate PME rank is in use.
* \param[in] wcycle The wallclock counter.
*/
GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
bool GPU_FUNC_ARGUMENT(useMdGpuGraph),
bool GPU_FUNC_ARGUMENT(gpuGraphWithSeparatePmeRank),
gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;

/*! \brief Set pointer to device copy of coordinate data.
Expand Down
4 changes: 2 additions & 2 deletions src/gromacs/ewald/pme_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ void pme_gpu_wait_and_reduce(gmx_pme_t* pme,
pme_gpu_reduce_outputs(computeEnergyAndVirial, output, wcycle, forceWithVirial, enerd);
}

void pme_gpu_reinit_computation(const gmx_pme_t* pme, const bool useMdGpuGraph, gmx_wallcycle* wcycle)
void pme_gpu_reinit_computation(const gmx_pme_t* pme, const bool gpuGraphWithSeparatePmeRank, gmx_wallcycle* wcycle)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");

Expand All @@ -434,7 +434,7 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, const bool useMdGpuGraph,
pme_gpu_update_timings(pme->gpu);

pme_gpu_clear_grids(pme->gpu);
pme_gpu_clear_energy_virial(pme->gpu, useMdGpuGraph);
pme_gpu_clear_energy_virial(pme->gpu, gpuGraphWithSeparatePmeRank);

wallcycle_stop(wcycle, WallCycleCounter::LaunchGpuPme);
}
Expand Down
4 changes: 2 additions & 2 deletions src/gromacs/ewald/pme_gpu_internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ void pme_gpu_free_energy_virial(PmeGpu* pmeGpu)
}
}

void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu, const bool useMdGpuGraph)
void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu, const bool gpuGraphWithSeparatePmeRank)
{
for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
Expand All @@ -174,7 +174,7 @@ void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu, const bool useMdGpuGraph)
c_virialAndEnergyCount,
pmeGpu->archSpecific->pmeStream_);
}
if (pmeGpu->settings.useGpuForceReduction && useMdGpuGraph)
if (pmeGpu->settings.useGpuForceReduction && gpuGraphWithSeparatePmeRank)
{
// Mark forces ready event after this clearing, otherwise CUDA graph capture fails due to unjoined work
pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream_);
Expand Down
6 changes: 3 additions & 3 deletions src/gromacs/ewald/pme_gpu_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,10 @@ void pme_gpu_free_energy_virial(PmeGpu* pmeGpu);
* Clears the energy and virial memory on GPU with 0.
* Should be called at the end of PME computation which returned energy/virial.
*
* \param[in] pmeGpu The PME GPU structure.
* \param[in] useMdGpuGraph Whether MD GPU Graph is in use.
* \param[in] pmeGpu The PME GPU structure.
* \param[in] gpuGraphWithSeparatePmeRank Whether MD GPU Graph with separate PME rank is in use.
*/
void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu, bool useMdGpuGraph);
void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu, bool gpuGraphWithSeparatePmeRank);

/*! \libinternal \brief
* Reallocates and copies the pre-computed B-spline values to the GPU.
Expand Down
2 changes: 2 additions & 0 deletions src/gromacs/mdlib/mdgraph_gpu_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ void MdGpuGraph::Impl::startRecord(GpuEventSynchronizer* xReadyOnDeviceEvent)
helperEvent_->enqueueWaitEvent(deviceStreamManager_.stream(gmx::DeviceStreamType::UpdateAndConstraints));

// Re-mark xReadyOnDeviceEvent to allow full isolation within graph capture
xReadyOnDeviceEvent->reset();
xReadyOnDeviceEvent->markEvent(deviceStreamManager_.stream(gmx::DeviceStreamType::UpdateAndConstraints));
graphState_ = GraphState::Recording;
};
Expand Down Expand Up @@ -491,6 +492,7 @@ void MdGpuGraph::Impl::launchGraphMdStep(GpuEventSynchronizer* xUpdatedOnDeviceE
// TODO: This is actually only required on steps that don't have graph usage in
// their following step, but it is harmless to do it on all steps for the time being.
enqueueRank0EventToAllPpStreams(helperEvent_.get(), *thisLaunchStream);
xUpdatedOnDeviceEvent->reset();
xUpdatedOnDeviceEvent->markEvent(*thisLaunchStream);

wallcycle_sub_stop(wcycle_, WallCycleSubCounter::MdGpuGraphLaunch);
Expand Down
3 changes: 2 additions & 1 deletion src/gromacs/mdlib/sim_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1009,7 +1009,8 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv,
if (runScheduleWork.stepWork.haveGpuPmeOnThisRank)
{
wallcycle_start_nocount(wcycle, WallCycleCounter::PmeGpuMesh);
pme_gpu_reinit_computation(pmedata, runScheduleWork.simulationWork.useMdGpuGraph, wcycle);
bool gpuGraphWithSeparatePmeRank = false;
pme_gpu_reinit_computation(pmedata, gpuGraphWithSeparatePmeRank, wcycle);
wallcycle_stop(wcycle, WallCycleCounter::PmeGpuMesh);
}

Expand Down

0 comments on commit 9bf709b

Please sign in to comment.