From a527ce7a6b65e5abeb5d5505e141306288868b8b Mon Sep 17 00:00:00 2001 From: ashors1 <71393111+ashors1@users.noreply.github.com> Date: Tue, 25 Jun 2024 05:27:42 -0700 Subject: [PATCH] [NeMo-UX] minor logging bug fixes (#9529) * minor exp_manager bug fixes * remove print statement * fix docstring * fix AppState defaults --------- Co-authored-by: Marc Romeyn --- nemo/lightning/nemo_logger.py | 8 ++++++++ .../callbacks/megatron_model_checkpoint.py | 11 ++++------- nemo/utils/app_state.py | 18 +++++++++++++++++- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 2ad0753d04c5..fbf9298dfec4 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -100,6 +100,7 @@ def setup( "No version folders would be created under the log folder as 'resume_if_exists' is enabled." ) version = None + trainer.logger._version = version or "" if version: if is_global_rank_zero(): os.environ[NEMO_ENV_VARNAME_VERSION] = version @@ -160,6 +161,12 @@ def setup( # This is set if the env var NEMO_TESTING is set to True. nemo_testing = get_envbool(NEMO_ENV_VARNAME_TESTING, False) + files_to_move = [] + if Path(log_dir).exists(): + for child in Path(log_dir).iterdir(): + if child.is_file(): + files_to_move.append(child) + # Handle logging to file log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt' if self.log_local_rank_0_only is True and not nemo_testing: @@ -174,6 +181,7 @@ def setup( add_handlers_to_mcore_logger() + app_state.files_to_move = files_to_move app_state.files_to_copy = self.files_to_copy app_state.cmd_args = sys.argv diff --git a/nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py b/nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py index fb10ad3a218b..44b1ab238198 100644 --- a/nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py @@ -82,11 +82,7 @@ def on_train_start(self, trainer, pl_module): log_dir = app_state.log_dir # Check to see if any files exist that need to be moved - files_to_move = [] - if Path(log_dir).exists(): - for child in Path(log_dir).iterdir(): - if child.is_file(): - files_to_move.append(child) + files_to_move = app_state.files_to_move if len(files_to_move) > 0: # Move old files to a new folder @@ -106,8 +102,9 @@ def on_train_start(self, trainer, pl_module): shutil.copy(Path(_file), log_dir) # Create files for cmd args and git info - with open(log_dir / 'cmd-args.log', 'w', encoding='utf-8') as _file: - _file.write(" ".join(app_state.cmd_args)) + if app_state.cmd_args: + with open(log_dir / 'cmd-args.log', 'w', encoding='utf-8') as _file: + _file.write(" ".join(app_state.cmd_args)) # Try to get git hash git_repo, git_hash = get_git_hash() diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 4d1d7387ba90..7a60c3969df3 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -81,8 +81,10 @@ def __init__(self): self._model_guid_map = {} # type: Dict[str, ModelMetadataRegistry] self._restore = False # TODO: are this and _is_model_being_restored both needed? + # files from a previous run to move into a new directory + self.files_to_move = [] # files to copy into log dir - self._files_to_copy = None + self._files_to_copy = [] # command-ling arguments for run self._cmd_args = None @@ -560,6 +562,20 @@ def checkpoint_callback_params(self, params): """ self._checkpoint_callback_params = params + @property + def files_to_move(self): + """Returns the list of files to move into a separate directory.""" + return self._files_to_move + + @files_to_move.setter + def files_to_move(self, files): + """Sets the files_to_move property. + + Args: + files (list[str]): list of filenames to move. + """ + self._files_to_move = files + @property def files_to_copy(self): """Returns the list of files to copy into the log dir."""