Skip to content

Commit

Permalink
Merge branch 'patch20240822' of github.com:openpsi-project/realhf int…
Browse files Browse the repository at this point in the history
…o profile
  • Loading branch information
garrett4wade committed Aug 27, 2024
2 parents 793eb3f + 6adc95c commit f12bd7a
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 5 deletions.
7 changes: 7 additions & 0 deletions realhf/api/core/system_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ class ExperimentSaveEvalControl:
:param benchmark_steps: Terminate the training after this number of steps.
Used by system benchmark only. Please leave it to None for normal training.
:type benchmark_steps: Optional[int]
:param save_eval_timeout: Timeout in seconds for saving and evaluation.
Used only at the last step of the experiment: the master worker sleeps
for `save_eval_timeout` seconds to wait for all save and evaluation requests to finish.
Defaults to 120 seconds.
:type save_eval_timeout: int
"""

total_train_epochs: int = 1
Expand All @@ -199,6 +204,8 @@ class ExperimentSaveEvalControl:
eval_freq_secs: Optional[int] = None
# benchmark
benchmark_steps: Optional[int] = None
# Graceful exit
save_eval_timeout: int = 120


@dataclasses.dataclass
Expand Down
17 changes: 12 additions & 5 deletions realhf/system/master_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -1359,11 +1359,18 @@ def _poll(self):
self.__benchmark_steps is not None
and self._global_step >= self.__benchmark_steps
) or (is_new_epoch and self._epoch > self.__total_train_epochs):
logger.info(
f"Finished benchmark {self.__benchmark_steps}. "
f"Time consumption of this setup: {time_since_configure:.3f}"
)
logger.info(f"avg #e2e# time *{np.mean(self.e2e_time_history):.3f}*")
if should_eval or should_save:
logger.info(
f"Waiting for all save/eval requests at the last step"
f" for {self.config.exp_ctrl.save_eval_timeout} secs..."
)
time.sleep(self.config.exp_ctrl.save_eval_timeout)
if self.__benchmark_steps is not None:
logger.info(
f"Finished benchmark {self.__benchmark_steps}. "
f"Time consumption of this setup: {time_since_configure:.3f}"
)
logger.info(f"avg #e2e# time *{np.mean(self.e2e_time_history):.3f}*")
return self.experiment_complete_exit()

# Send clear cache requests to model workers.
Expand Down

0 comments on commit f12bd7a

Please sign in to comment.