From e6e8c1378de035df59034d09373b44af3319b6d7 Mon Sep 17 00:00:00 2001 From: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Mon, 4 Mar 2024 10:35:12 -0800 Subject: [PATCH] Fix moe cpu offload (#5220) The MoE parameter gradient norms don't need to be averaged when they are created on CPU and training uses only a single data-parallel rank (1-DP). However, when the MoE parameters are data-parallel and CPU offload is in use, the norm tensor is now moved to the accelerator device for the all-reduce average and then moved back to the original device. This PR addresses https://github.com/microsoft/DeepSpeed/issues/5203 --------- Co-authored-by: Reza Yazdani --- deepspeed/runtime/zero/stage_1_and_2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index e4009f6ac883..71a01b2391f8 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1946,8 +1946,10 @@ def _average_expert_grad_norms(self, norm_groups): for i, norm in enumerate(norm_groups): if self.is_moe_param_group[i]: scaled_norm_tensor = norm * 1.0 / dist.get_world_size(group=self.real_dp_process_group[i]) + if self.device == 'cpu': + scaled_norm_tensor = scaled_norm_tensor.to(get_accelerator().current_device_name()) dist.all_reduce(scaled_norm_tensor, group=self.real_dp_process_group[i]) - norm_groups[i] = scaled_norm_tensor + norm_groups[i] = scaled_norm_tensor.to(self.device) def unscale_and_clip_grads(self, grad_groups_flat, total_norm): # compute combined scale factor for this group