From 98272d14fe0be043c47b5637cc560c6c49ea88ce Mon Sep 17 00:00:00 2001
From: billishyahao
Date: Tue, 16 Jul 2024 08:54:02 +0800
Subject: [PATCH] [bugfix] promote state in bf16_optimizer (#5767)

This patch promotes `state` in `bf16_optimizer` so it is accessible in
downstream DeepSpeed use cases. For example, without the patch we hit the
following issue in the Megatron-DeepSpeed llama showcase:

```
[rank3]: Traceback (most recent call last):
[rank3]:   File "/yahao/Megatron-DeepSpeed/pretrain_gpt.py", line 356, in <module>
[rank3]:     pretrain(train_valid_test_datasets_provider,
[rank3]:   File "/yahao/Megatron-DeepSpeed/megatron/training.py", line 222, in pretrain
[rank3]:     iteration = train(forward_step_func,
[rank3]:   File "/yahao/Megatron-DeepSpeed/megatron/training.py", line 1264, in train
[rank3]:     report_memory_flag = training_log(loss_dict, total_loss_dict,
[rank3]:   File "/yahao/Megatron-DeepSpeed/megatron/training.py", line 999, in training_log
[rank3]:     opt_stats[0] += (torch.norm(optimizer.state[param]['exp_avg_sq']).item())**2
[rank3]: AttributeError: 'BF16_Optimizer' object has no attribute 'state'
```

With the patch, the invocation passes smoothly.

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/runtime/bf16_optimizer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py
index 965b446163ec..325188f02931 100644
--- a/deepspeed/runtime/bf16_optimizer.py
+++ b/deepspeed/runtime/bf16_optimizer.py
@@ -540,6 +540,11 @@ def param_groups(self):
         """Forward the wrapped optimizer's parameters."""
         return self.optimizer.param_groups
 
+    @property
+    def state(self):
+        """Forward the wrapped optimizer's states."""
+        return self.optimizer.state
+
     def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx):
         assert self.immediate_grad_update
         self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=True)
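
For reference, below is a minimal, self-contained sketch (not part of the patch) of the property-forwarding pattern the change relies on. `Bf16WrapperSketch` is an illustrative stand-in for DeepSpeed's `BF16_Optimizer`, and the wrapped optimizer is a plain `torch.optim.Adam`; only the access pattern mirrors the failing Megatron-DeepSpeed `training_log` line from the traceback above.

```python
# Minimal sketch under assumed names (Bf16WrapperSketch is NOT DeepSpeed's class).
# It illustrates why forwarding `state` as a read-only property lets downstream
# code read per-parameter optimizer state through the wrapper.
import torch


class Bf16WrapperSketch:
    """Toy wrapper that forwards attributes of an inner torch optimizer."""

    def __init__(self, inner_optimizer):
        self.optimizer = inner_optimizer

    @property
    def param_groups(self):
        """Forward the wrapped optimizer's parameter groups."""
        return self.optimizer.param_groups

    @property
    def state(self):
        """Forward the wrapped optimizer's state (the behavior this patch adds)."""
        return self.optimizer.state


if __name__ == "__main__":
    param = torch.nn.Parameter(torch.randn(4))
    inner = torch.optim.Adam([param])

    # Take one step so Adam populates exp_avg / exp_avg_sq in its state dict.
    param.grad = torch.ones_like(param)
    inner.step()

    optimizer = Bf16WrapperSketch(inner)
    # Same access pattern as the Megatron-DeepSpeed training_log() line above.
    norm_sq = torch.norm(optimizer.state[param]['exp_avg_sq']).item() ** 2
    print(norm_sq)
```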