From 33860504488d6bfe1aca9e701929ad72b37493ca Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Thu, 28 May 2015 17:26:02 +0200 Subject: [PATCH] Checkpoints: Check for CUDA Errors Check for cuda errors to spare the time for writing a checkpoint and also synchronize again afterwards to ensure a checkpoint can really be considered "valid" (not corrupted). --- .../simulationControl/SimulationHelper.hpp | 19 ++++++++++++++++++- .../initialization/InitialiserController.hpp | 8 ++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/libPMacc/include/simulationControl/SimulationHelper.hpp b/src/libPMacc/include/simulationControl/SimulationHelper.hpp index 4db9e08205..75502fb60a 100644 --- a/src/libPMacc/include/simulationControl/SimulationHelper.hpp +++ b/src/libPMacc/include/simulationControl/SimulationHelper.hpp @@ -126,6 +126,16 @@ class SimulationHelper : public IPlugin /* trigger checkpoint notification */ if (checkpointPeriod && (currentStep % checkpointPeriod == 0)) { + /* first synchronize: if something failed, we can spare the time + * for the checkpoint writing */ + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); + + GridController &gc = Environment::get().GridController(); + /* can be spared for better scaling, but allows to spare the + * time for checkpointing if some ranks died */ + MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); + /* create directory containing checkpoints */ if (numCheckpoints == 0) { @@ -135,7 +145,14 @@ class SimulationHelper : public IPlugin Environment::get().PluginConnector().checkpointPlugins(currentStep, checkpointDirectory); - GridController &gc = Environment::get().GridController(); + /* important synchronize: only if no errors occurred until this + * point guarantees that a checkpoint is usable */ + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); + + /* \todo in an ideal world with MPI-3, this would be an + * MPI_Ibarrier call and this function would return 
an MPI_Request + * that could be checked */ MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); if (gc.getGlobalRank() == 0) diff --git a/src/picongpu/include/initialization/InitialiserController.hpp b/src/picongpu/include/initialization/InitialiserController.hpp index f115f996be..c1b13d3b30 100644 --- a/src/picongpu/include/initialization/InitialiserController.hpp +++ b/src/picongpu/include/initialization/InitialiserController.hpp @@ -84,6 +84,14 @@ class InitialiserController : public IInitPlugin Environment<>::get().PluginConnector().restartPlugins(restartStep, restartDirectory); __getTransactionEvent().waitForFinished(); + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaGetLastError()); + + GridController &gc = Environment::get().GridController(); + /* can be spared for better scaling, but guarantees the user + * that the restart was successful */ + MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); + log ("Loading from persistent data finished"); }