From 24d1d86ba294c7794dce5848d8caac55438fc5ad Mon Sep 17 00:00:00 2001
From: YiSheng5
Date: Wed, 14 Aug 2024 19:19:21 +0800
Subject: [PATCH 1/2] [Zero2] Reduce the unnecessary all-reduce when tensor
 size is 0. (#5868)

When running for Zero2, the reduce_bucket_size we set is not large enough,
the self.elements_in_ipg_bucket will be 0, then in function average_tensor
the input is the tensor with size=0
https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage_1_and_2.py#L1372

use reduce_scatter can be WA
https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage_1_and_2.py#L1066

if user uses the reduce_scatter=false, in function
gradient_reduction_w_predivide will meet the unnecessary all-reduce with
tensor size is 0.
https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage_1_and_2.py#L974

This pr is to add the judgement to reduce this unnecessary all-reduce.

Co-authored-by: Olatunji Ruwase
---
 deepspeed/runtime/zero/stage_1_and_2.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 461ad719536a..57e80911d645 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -972,6 +972,8 @@ def print_rank_0(self, message):
             logger.info(message)
 
     def gradient_reduction_w_predivide(self, tensor):
+        if tensor.size().numel() == 0:
+            return tensor
 
         dp_world_size = dist.get_world_size(group=self.dp_process_group)
 

From 051c993310f0c521ee0758d717168edcdf60c77e Mon Sep 17 00:00:00 2001
From: Raza Sikander <54884406+raza-sikander@users.noreply.github.com>
Date: Wed, 14 Aug 2024 20:36:38 +0530
Subject: [PATCH 2/2] Update container version for Gaudi2 CI (#5937)

Update version to 1.17.0 from 1.16.2

Co-authored-by: Shaik Raza Sikander
---
 .github/workflows/hpu-gaudi2.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml
index ac19638e67de..0272829e8286 100644
--- a/.github/workflows/hpu-gaudi2.yml
+++ b/.github/workflows/hpu-gaudi2.yml
@@ -39,7 +39,7 @@ jobs:
     # The type of runner that the job will run on
     runs-on: [self-hosted, intel, gaudi2]
     container:
-      image: vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest
+      image: vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
       ports:
         - 80
       options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice