diff --git a/README.md b/README.md
index 5f5452ce232d..8bfc344a4edc 100755
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
 ## Latest News
 DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat).
-
+* [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md)
 * [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)]
 * [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)]
 * [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py
index e93cb1c95f15..0e759146cadd 100755
--- a/deepspeed/utils/zero_to_fp32.py
+++ b/deepspeed/utils/zero_to_fp32.py
@@ -514,19 +514,20 @@ def to_torch_tensor(state_dict, return_empty_tensor=False):
     """
     Convert state_dict of GatheredTensor to torch tensor
     """
+    torch_state_dict = {}
     converted_tensors = {}
     for name, tensor in state_dict.items():
         tensor_id = id(tensor)
-        if tensor_id in converted_tensors:
-            shared_tensor = state_dict[converted_tensors[tensor_id]]
-            state_dict[name] = shared_tensor
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
         else:
             converted_tensors[tensor_id] = name
             if return_empty_tensor:
-                state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
             else:
-                state_dict[name] = tensor.contiguous()
-    return state_dict
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict


 def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
@@ -660,8 +661,9 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
         else:
             torch.save(shard_state_dict, output_path)
         # release the memory of current shard
-        for tensor_name in shard_state_dict:
+        for tensor_name in list(shard_state_dict.keys()):
             del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
         del shard_state_dict
         gc.collect()