Skip to content

Commit

Permalink
Merge pull request ComputationalRadiationPhysics#897 from ax3l/topic-…
Browse files Browse the repository at this point in the history
…cudaSyncCheckpoints

Checkpoints: Check for CUDA Errors
  • Loading branch information
psychocoderHPC committed May 29, 2015
2 parents a16bea8 + 3386050 commit f0ba515
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 1 deletion.
19 changes: 18 additions & 1 deletion src/libPMacc/include/simulationControl/SimulationHelper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,16 @@ class SimulationHelper : public IPlugin
/* trigger checkpoint notification */
if (checkpointPeriod && (currentStep % checkpointPeriod == 0))
{
/* first synchronize: if something failed, we can spare the time
* for the checkpoint writing */
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaGetLastError());

GridController<DIM> &gc = Environment<DIM>::get().GridController();
/* can be omitted for better scaling, but allows us to spare the
* time for checkpointing if some ranks died */
MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));

/* create directory containing checkpoints */
if (numCheckpoints == 0)
{
Expand All @@ -135,7 +145,14 @@ class SimulationHelper : public IPlugin
Environment<DIM>::get().PluginConnector().checkpointPlugins(currentStep,
checkpointDirectory);

GridController<DIM> &gc = Environment<DIM>::get().GridController();
/* important synchronize: a checkpoint is only guaranteed to be
* usable if no errors occurred up to this point */
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaGetLastError());

/* \todo in an ideal world with MPI-3, this would be an
* MPI_Ibarrier call and this function would return a MPI_Request
* that could be checked */
MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));

if (gc.getGlobalRank() == 0)
Expand Down
8 changes: 8 additions & 0 deletions src/picongpu/include/initialization/InitialiserController.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,14 @@ class InitialiserController : public IInitPlugin
Environment<>::get().PluginConnector().restartPlugins(restartStep, restartDirectory);
__getTransactionEvent().waitForFinished();

CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaGetLastError());

GridController<simDim> &gc = Environment<simDim>::get().GridController();
/* can be omitted for better scaling, but guarantees to the user
* that the restart was successful */
MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));

log<picLog::SIMULATION_STATE > ("Loading from persistent data finished");
}

Expand Down

0 comments on commit f0ba515

Please sign in to comment.