diff --git a/src/libPMacc/include/simulationControl/SimulationHelper.hpp b/src/libPMacc/include/simulationControl/SimulationHelper.hpp index 4db9e08205..75502fb60a 100644 --- a/src/libPMacc/include/simulationControl/SimulationHelper.hpp +++ b/src/libPMacc/include/simulationControl/SimulationHelper.hpp @@ -126,6 +126,16 @@ class SimulationHelper : public IPlugin /* trigger checkpoint notification */ if (checkpointPeriod && (currentStep % checkpointPeriod == 0)) { + /* synchronize first: if something already failed, we can skip the + * time-consuming checkpoint writing */ + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); + + GridController &gc = Environment::get().GridController(); + /* can be omitted for better scaling, but allows us to skip the + * checkpoint writing if some ranks died */ + MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); + /* create directory containing checkpoints */ if (numCheckpoints == 0) { @@ -135,7 +145,14 @@ class SimulationHelper : public IPlugin Environment::get().PluginConnector().checkpointPlugins(currentStep, checkpointDirectory); - GridController &gc = Environment::get().GridController(); + /* important synchronize: the checkpoint is only guaranteed to be + * usable if no errors occurred up to this point */ + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); + + /* \todo in an ideal world with MPI-3, this would be an + * MPI_Ibarrier call and this function would return an MPI_Request + * that could be checked */ MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); if (gc.getGlobalRank() == 0) diff --git a/src/picongpu/include/initialization/InitialiserController.hpp b/src/picongpu/include/initialization/InitialiserController.hpp index f115f996be..c1b13d3b30 100644 --- a/src/picongpu/include/initialization/InitialiserController.hpp +++ b/src/picongpu/include/initialization/InitialiserController.hpp @@ -84,6 +84,14 @@ class InitialiserController : public IInitPlugin 
Environment<>::get().PluginConnector().restartPlugins(restartStep, restartDirectory); __getTransactionEvent().waitForFinished(); + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); + + GridController &gc = Environment::get().GridController(); + /* can be omitted for better scaling, but guarantees the user + * that the restart was successful */ + MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); + log ("Loading from persistent data finished"); }