Merge branch 'master' into loadams/update-release-workflow

microsoft · Dec 4, 2024 · 1e17fdf
2 parents 19f835c + fc23007
commit 1e17fdf
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@
 ## Latest News
 <b> <span style="color:orange" > DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat)</span>.</b>
 
-
+* [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md)
 * [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)]
 * [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)]
 * [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]

diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py
@@ -514,19 +514,20 @@ def to_torch_tensor(state_dict, return_empty_tensor=False):
     """
     Convert state_dict of GatheredTensor to torch tensor
     """
+    torch_state_dict = {}
     converted_tensors = {}
     for name, tensor in state_dict.items():
         tensor_id = id(tensor)
-        if tensor_id in converted_tensors:
-            shared_tensor = state_dict[converted_tensors[tensor_id]]
-            state_dict[name] = shared_tensor
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
         else:
             converted_tensors[tensor_id] = name
             if return_empty_tensor:
-                state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
             else:
-                state_dict[name] = tensor.contiguous()
-    return state_dict
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
 
 
 def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
@@ -660,8 +661,9 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
         else:
             torch.save(shard_state_dict, output_path)
         # release the memory of current shard
-        for tensor_name in shard_state_dict:
+        for tensor_name in list(shard_state_dict.keys()):
             del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
         del shard_state_dict
         gc.collect()