fix FSDP bugs in HF Trainer
sangkeun00 committed May 30, 2024
1 parent 1dfcdca commit 08ebc16
Showing 2 changed files with 11 additions and 10 deletions.
17 changes: 8 additions & 9 deletions logix/huggingface/callback.py
@@ -20,9 +20,15 @@ def __init__(
         self._log_dataloader = None
 
     def on_init_end(self, args, state, control, **kwargs):
+        model = kwargs["model"]
+        self.logix.watch(
+            model=model,
+            name_filter=self.args.name_filter,
+            type_filter=self.args.type_filter,
+        )
+
         if self.args.lora:
-            model = kwargs["model"]
-            self.logix.add_lora(model, watch=False)
+            self.logix.add_lora()
 
     def on_epoch_begin(self, args, state, control, **kwargs):
         if self.args.mode == "log":
@@ -33,13 +39,6 @@ def on_epoch_end(self, args, state, control, **kwargs):
         self.logix.finalize()
 
     def on_train_begin(self, args, state, control, **kwargs):
-        model = kwargs["model"]
-        self.logix.watch(
-            model=model,
-            name_filter=self.args.name_filter,
-            type_filter=self.args.type_filter,
-        )
-
         if self.args.initialize_from_log:
            self.logix.initialize_from_log()
 
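The callback change moves logix.watch() from on_train_begin to on_init_end. The likely reason: with FSDP, the HF Trainer wraps the model inside train() before on_train_begin fires, so hooks registered there would attach to the FSDP-wrapped module, whose submodule names are rewritten (e.g. prefixed with _fsdp_wrapped_module) and whose parameters are flattened, while on_init_end still sees the unwrapped model. A minimal sketch of the same timing pattern with a plain TrainerCallback follows; the HookTimingCallback class and its name filter are hypothetical, for illustration only, not LogIX's actual callback.

    from transformers import TrainerCallback

    class HookTimingCallback(TrainerCallback):
        """Hypothetical callback: register module hooks before FSDP wraps the model."""

        def on_init_end(self, args, state, control, **kwargs):
            # Trainer.__init__ has finished, but the model has not yet been
            # wrapped by FSDP/DDP, so module names and shapes are the originals.
            model = kwargs["model"]
            self.handles = [
                module.register_forward_hook(self._on_forward)
                for name, module in model.named_modules()
                if name.endswith("proj")  # hypothetical filter
            ]

        def on_train_begin(self, args, state, control, **kwargs):
            # By this point kwargs["model"] may already be the FSDP-wrapped
            # module, which is why watch() was moved out of this hook.
            pass

        def _on_forward(self, module, inputs, output):
            pass  # record whatever statistics are needed

The stripped-down add_lora() call is consistent with this ordering: once watch() has registered the model in on_init_end, add_lora() can act on the model LogIX already holds rather than taking it (and a watch flag) explicitly; this reading is inferred from the diff alone.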
4 changes: 3 additions & 1 deletion logix/logix.py
@@ -203,6 +203,9 @@ def add_lora(
             assert name in self.model.state_dict(), f"{name} not in model!"
         model.load_state_dict(lora_state, strict=False)
 
+        if get_rank() == 0:
+            self.save_lora()
+
         # Clear state and logger
         if clear:
             msg = "LogIX will clear the previous Hessian, Storage, and Logging "
@@ -464,7 +467,6 @@ def finalize(
         if get_rank() == 0:
             self.save_config()
             self.save_state()
-            self.save_lora()
 
     def setup(self, log_option_kwargs: Dict[str, Any]) -> None:
         """
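On the logix.py side, save_lora() moves from finalize() into add_lora(), still guarded by a rank-0 check. A plausible reading: the LoRA weights should be written out immediately after they are created, while the state dict still holds full, unsharded tensors under their original names; by finalize() time an FSDP run has already flattened and sharded them. A rough sketch of the general rank-0-save pattern follows, assuming torch.distributed; the save_lora_weights helper and the "lora" key filter are illustrative, not LogIX's actual implementation.

    import os
    import torch
    import torch.distributed as dist

    def get_rank() -> int:
        # Global rank, treating single-process runs as rank 0.
        if dist.is_available() and dist.is_initialized():
            return dist.get_rank()
        return int(os.environ.get("RANK", "0"))

    def save_lora_weights(model: torch.nn.Module, path: str) -> None:
        # Write only the LoRA parameters, and only once, from rank 0. Doing this
        # right after the LoRA modules are added -- before FSDP flattens and
        # shards parameters -- keeps keys and full tensors intact.
        if get_rank() != 0:
            return
        lora_state = {k: v for k, v in model.state_dict().items() if "lora" in k}
        torch.save(lora_state, path)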
