Merge branch 'patch20240822' of github.com:openpsi-project/realhf into profile
openpsi-project · Aug 27, 2024 · f12bd7a · f12bd7a
2 parents 793eb3f + 6adc95c
commit f12bd7a
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 5 deletions.
diff --git a/realhf/api/core/system_api.py b/realhf/api/core/system_api.py
@@ -186,6 +186,11 @@ class ExperimentSaveEvalControl:
     :param benchmark_steps: Terminate the training after this number of steps.
         Used by system benchmark only. Please leave it to None for normal training.
     :type benchmark_steps: Optional[int]
+    :param save_eval_timeout: Timeout in seconds for saving and evaluation.
+        Used at the last step of the experiment: if a save or evaluation is triggered,
+        the master worker sleeps for `save_eval_timeout` seconds to wait for it to finish.
+        Defaults to 120 seconds.
+    :type save_eval_timeout: int
     """
 
     total_train_epochs: int = 1
@@ -199,6 +204,8 @@ class ExperimentSaveEvalControl:
     eval_freq_secs: Optional[int] = None
     # benchmark
     benchmark_steps: Optional[int] = None
+    # Graceful exit
+    save_eval_timeout: int = 120
 
 
 @dataclasses.dataclass

diff --git a/realhf/system/master_worker.py b/realhf/system/master_worker.py
@@ -1359,11 +1359,18 @@ def _poll(self):
             self.__benchmark_steps is not None
             and self._global_step >= self.__benchmark_steps
         ) or (is_new_epoch and self._epoch > self.__total_train_epochs):
-            logger.info(
-                f"Finished benchmark {self.__benchmark_steps}. "
-                f"Time consumption of this setup: {time_since_configure:.3f}"
-            )
-            logger.info(f"avg #e2e# time *{np.mean(self.e2e_time_history):.3f}*")
+            if should_eval or should_save:
+                logger.info(
+                    f"Waiting for all save/eval requests at the last step"
+                    f" for {self.config.exp_ctrl.save_eval_timeout} secs..."
+                )
+                time.sleep(self.config.exp_ctrl.save_eval_timeout)
+            if self.__benchmark_steps is not None:
+                logger.info(
+                    f"Finished benchmark {self.__benchmark_steps}. "
+                    f"Time consumption of this setup: {time_since_configure:.3f}"
+                )
+                logger.info(f"avg #e2e# time *{np.mean(self.e2e_time_history):.3f}*")
             return self.experiment_complete_exit()
 
         # Send clear cache requests to model workers.