From dcf6282baef37b992194c1a56289b7bcd179ddfd Mon Sep 17 00:00:00 2001 From: Liran Bachar Date: Wed, 13 Mar 2024 11:22:05 +0200 Subject: [PATCH] Average only valid part of the ipg buffer. When contiguous gradients is used, the ipg buffer may not be fully utilized. Call average_tensor only for the slice with valid gradients. Change-Id: I760559d52c2f91e15cd6cd0b48e534ec2352802a --- deepspeed/runtime/zero/stage_1_and_2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index b1d94a4459d9..e8823f153fb8 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1360,7 +1360,7 @@ def reduce_ipg_grads(self): self.average_tensor(extra_large_grad_reduc.view(-1)) self.extra_large_param_to_reduce = None else: - self.average_tensor(self.ipg_buffer[self.ipg_index]) + self.average_tensor(self.ipg_buffer[self.ipg_index].narrow(0, 0, self.elements_in_ipg_bucket)) else: self.buffered_reduce_fallback(None, self.grads_in_ipg_bucket,