From 98272d14fe0be043c47b5637cc560c6c49ea88ce Mon Sep 17 00:00:00 2001
From: billishyahao
Date: Tue, 16 Jul 2024 08:54:02 +0800
Subject: [PATCH] [bugfix] promote state in bf16_optimizer (#5767)

This patch promotes `state` in `bf16_optimizer` so it is accessible in
downstream DeepSpeed use cases. For example, without the patch we hit the
following issue in the Megatron-DeepSpeed llama showcase:

```
[rank3]: Traceback (most recent call last):
[rank3]:   File "/yahao/Megatron-DeepSpeed/pretrain_gpt.py", line 356, in <module>
[rank3]:     pretrain(train_valid_test_datasets_provider,
[rank3]:   File "/yahao/Megatron-DeepSpeed/megatron/training.py", line 222, in pretrain
[rank3]:     iteration = train(forward_step_func,
[rank3]:   File "/yahao/Megatron-DeepSpeed/megatron/training.py", line 1264, in train
[rank3]:     report_memory_flag = training_log(loss_dict, total_loss_dict,
[rank3]:   File "/yahao/Megatron-DeepSpeed/megatron/training.py", line 999, in training_log
[rank3]:     opt_stats[0] += (torch.norm(optimizer.state[param]['exp_avg_sq']).item())**2
[rank3]: AttributeError: 'BF16_Optimizer' object has no attribute 'state'
```

With the patch, the invocation passes smoothly.

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/runtime/bf16_optimizer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py
index 965b446163ec..325188f02931 100644
--- a/deepspeed/runtime/bf16_optimizer.py
+++ b/deepspeed/runtime/bf16_optimizer.py
@@ -540,6 +540,11 @@ def param_groups(self):
         """Forward the wrapped optimizer's parameters."""
         return self.optimizer.param_groups
 
+    @property
+    def state(self):
+        """Forward the wrapped optimizer's states."""
+        return self.optimizer.state
+
     def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx):
         assert self.immediate_grad_update
         self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=True)
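
For reference, below is a minimal, self-contained sketch (not part of the patch) of the property-forwarding pattern the change relies on. `Bf16WrapperSketch` is an illustrative stand-in for DeepSpeed's `BF16_Optimizer`, and the wrapped optimizer is a plain `torch.optim.Adam`; only the access pattern mirrors the failing Megatron-DeepSpeed `training_log` line from the traceback above.

```python
# Minimal sketch under assumed names (Bf16WrapperSketch is NOT DeepSpeed's class).
# It illustrates why forwarding `state` as a read-only property lets downstream
# code read per-parameter optimizer state through the wrapper.
import torch


class Bf16WrapperSketch:
    """Toy wrapper that forwards attributes of an inner torch optimizer."""

    def __init__(self, inner_optimizer):
        self.optimizer = inner_optimizer

    @property
    def param_groups(self):
        """Forward the wrapped optimizer's parameter groups."""
        return self.optimizer.param_groups

    @property
    def state(self):
        """Forward the wrapped optimizer's state (the behavior this patch adds)."""
        return self.optimizer.state


if __name__ == "__main__":
    param = torch.nn.Parameter(torch.randn(4))
    inner = torch.optim.Adam([param])

    # Take one step so Adam populates exp_avg / exp_avg_sq in its state dict.
    param.grad = torch.ones_like(param)
    inner.step()

    optimizer = Bf16WrapperSketch(inner)
    # Same access pattern as the Megatron-DeepSpeed training_log() line above.
    norm_sq = torch.norm(optimizer.state[param]['exp_avg_sq']).item() ** 2
    print(norm_sq)
```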