Skip to content

Commit

Permalink
[NeMo-UX] minor logging bug fixes (NVIDIA#9529)
Browse files Browse the repository at this point in the history
* minor exp_manager bug fixes

* remove print statement

* fix docstring

* fix AppState defaults

---------

Co-authored-by: Marc Romeyn <[email protected]>
  • Loading branch information
ashors1 and marcromeyn authored Jun 25, 2024
1 parent 490ade4 commit a527ce7
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
8 changes: 8 additions & 0 deletions nemo/lightning/nemo_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def setup(
"No version folders would be created under the log folder as 'resume_if_exists' is enabled."
)
version = None
trainer.logger._version = version or ""
if version:
if is_global_rank_zero():
os.environ[NEMO_ENV_VARNAME_VERSION] = version
Expand Down Expand Up @@ -160,6 +161,12 @@ def setup(
# This is set if the env var NEMO_TESTING is set to True.
nemo_testing = get_envbool(NEMO_ENV_VARNAME_TESTING, False)

files_to_move = []
if Path(log_dir).exists():
for child in Path(log_dir).iterdir():
if child.is_file():
files_to_move.append(child)

# Handle logging to file
log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
if self.log_local_rank_0_only is True and not nemo_testing:
Expand All @@ -174,6 +181,7 @@ def setup(

add_handlers_to_mcore_logger()

app_state.files_to_move = files_to_move
app_state.files_to_copy = self.files_to_copy
app_state.cmd_args = sys.argv

Expand Down
11 changes: 4 additions & 7 deletions nemo/lightning/pytorch/callbacks/megatron_model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,7 @@ def on_train_start(self, trainer, pl_module):
log_dir = app_state.log_dir

# Check to see if any files exist that need to be moved
files_to_move = []
if Path(log_dir).exists():
for child in Path(log_dir).iterdir():
if child.is_file():
files_to_move.append(child)
files_to_move = app_state.files_to_move

if len(files_to_move) > 0:
# Move old files to a new folder
Expand All @@ -106,8 +102,9 @@ def on_train_start(self, trainer, pl_module):
shutil.copy(Path(_file), log_dir)

# Create files for cmd args and git info
with open(log_dir / 'cmd-args.log', 'w', encoding='utf-8') as _file:
_file.write(" ".join(app_state.cmd_args))
if app_state.cmd_args:
with open(log_dir / 'cmd-args.log', 'w', encoding='utf-8') as _file:
_file.write(" ".join(app_state.cmd_args))

# Try to get git hash
git_repo, git_hash = get_git_hash()
Expand Down
18 changes: 17 additions & 1 deletion nemo/utils/app_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ def __init__(self):
self._model_guid_map = {} # type: Dict[str, ModelMetadataRegistry]
self._restore = False # TODO: are this and _is_model_being_restored both needed?

# files from a previous run to move into a new directory
self.files_to_move = []
# files to copy into log dir
self._files_to_copy = None
self._files_to_copy = []
# command-line arguments for run
self._cmd_args = None

Expand Down Expand Up @@ -560,6 +562,20 @@ def checkpoint_callback_params(self, params):
"""
self._checkpoint_callback_params = params

@property
def files_to_move(self):
"""Returns the list of files to move into a separate directory.

These are files left over from a previous run. The checkpoint callback's
``on_train_start`` reads this list and, when it is non-empty, moves the
old files to a new folder.
"""
return self._files_to_move

@files_to_move.setter
def files_to_move(self, files):
"""Sets the files_to_move property.

Args:
files (list): files from a previous run that should be moved. The logger
setup assigns the ``pathlib.Path`` entries it finds directly under the
log directory; ``__init__`` assigns an empty list as the default.
"""
self._files_to_move = files

@property
def files_to_copy(self):
"""Returns the list of files to copy into the log dir."""
Expand Down

0 comments on commit a527ce7

Please sign in to comment.