From 857584fd0c8865efaa65285ff4772b3e6142f6af Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Thu, 29 Feb 2024 09:39:05 -0800
Subject: [PATCH 1/2] Allow specifying MII branch on MII CI (#5208)

---
 .github/workflows/nv-mii.yml | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
index e542ce7464b7..0b3f128be5a4
--- a/.github/workflows/nv-mii.yml
+++ b/.github/workflows/nv-mii.yml
@@ -2,6 +2,12 @@ name: nv-mii
 
 on:
   workflow_dispatch:
+    inputs:
+      mii_branch:
+        description: 'DeepSpeed-MII Branch'
+        required: false
+        default: 'main'
+        type: string
   pull_request:
     paths:
       - '.github/workflows/nv-mii.yml'
@@ -55,7 +61,12 @@ jobs:
 
     - name: MII unit tests
       run: |
-        git clone https://github.com/microsoft/DeepSpeed-MII.git
+        BRANCH="main"
+        if [[ ! -z "${{ github.event.inputs.mii_branch }}" ]]; then
+          BRANCH="${{ github.event.inputs.mii_branch }}"
+        fi
+        echo "Cloning DeepSpeed-MII branch: $BRANCH"
+        git clone -b $BRANCH --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
         cd DeepSpeed-MII
         pip install .[dev]
         unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch

From 4578c2490b086ef786bfdcf3755074a1259af2c6 Mon Sep 17 00:00:00 2001
From: ByronHsu
Date: Fri, 1 Mar 2024 14:16:35 -0800
Subject: [PATCH 2/2] [zero++] Synchronize at the end of secondary partitioning
 and simplify the logic (#5216)

## 1. Why?

We have a very long thread investigating [the issue](https://github.com/microsoft/DeepSpeed/issues/5059). To summarize, it happens because:

a. The 2nd partitioning is asynchronous, since it is a device-to-device copy from the full tensor to the 2nd tensor.
b. When prefetching is used, the all-gather of the 2nd tensor can start before the 2nd partitioning has finished. At that moment, the 2nd tensor may still contain bad values.

![image](https://github.com/microsoft/DeepSpeed/assets/24364830/ad6ee6a2-8e1e-4214-a0d2-ee5314b252b8)

We also found that the copying logic was wrong and lengthy, so we simplified it to just two lines.

Kudos to @yundai424, Haowen Ning, @samadejacobs for the investigation effort.

## 2. What?

After multiple careful tests, we found that adding `get_accelerator().synchronize()` to ensure all CUDA streams have finished before the 2nd partitioning returns prevents the issue. A minimal illustration of the stream-ordering problem is sketched in section 5 below.

## 3. Tests

I validated the correctness of the simplified 2nd partition logic. The loss is exactly the same before and after the simplification under the same random seed.

Before
```
[
 {"loss": 2.0731}, {"loss": 2.0288}, {"loss": 1.927}, {"loss": 1.8347}, {"loss": 1.8347},
 {"loss": 1.7896}, {"loss": 1.602}, {"loss": 1.766}, {"loss": 1.8751}, {"loss": 1.6776}
]
```

After
```
[
 {"loss": 2.0731}, {"loss": 2.0288}, {"loss": 1.927}, {"loss": 1.8347}, {"loss": 1.8347},
 {"loss": 1.7896}, {"loss": 1.602}, {"loss": 1.766}, {"loss": 1.8751}, {"loss": 1.6776}
]
```

## 4. TODO

We need further investigation on the issue (@samadejacobs):
1) Revisit the ZeRO-3 prefetch design
2) Refactor hpz to reuse the primary tensor for the secondary partition
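## 5. Appendix: illustrating the race with plain PyTorch streams

For readers less familiar with CUDA stream semantics, here is a minimal PyTorch-only sketch of the ordering problem described in section 1 and of the synchronization fix. This is not DeepSpeed code: the tensor names, sizes, and streams are made up for illustration, with `prefetch_stream` standing in for the stream that runs the prefetched all-gather.

```python
import torch

# Minimal sketch (assumes a CUDA device is available). `secondary` stands in for
# ds_secondary_tensor, `full` for the flattened full parameter.
if torch.cuda.is_available():
    full = torch.randn(1 << 20, device="cuda")
    secondary = torch.empty(1 << 19, device="cuda")

    copy_stream = torch.cuda.current_stream()  # stream doing the 2nd-partitioning copy
    prefetch_stream = torch.cuda.Stream()      # stands in for the all-gather stream

    # "2nd partitioning": a device-to-device copy_() that is only queued here;
    # the host returns before the copy has actually finished on the GPU.
    secondary.copy_(full.narrow(0, 0, secondary.numel()))

    # Without any ordering, work launched on prefetch_stream below could read
    # `secondary` while the copy above is still in flight.
    copy_stream.synchronize()  # block the host until the copy stream drains, as this patch does
    # Alternative, cheaper ordering: make the consumer stream wait on the producer stream:
    # prefetch_stream.wait_stream(copy_stream)

    with torch.cuda.stream(prefetch_stream):
        checksum = secondary.sum()  # stands in for the all-gather read

    torch.cuda.synchronize()
    print(checksum.item())
```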
---------

Signed-off-by: byhsu
Co-authored-by: byhsu
---
 .../runtime/zero/partition_parameters.py | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index 5cf655d8741a..142259c1b7df 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -1635,19 +1635,16 @@ def _partition_param_sec(self, param, buffer=None, has_been_updated=False):
             secondary_end = secondary_start + secondary_partition_size
 
             one_dim_param = param.contiguous().view(-1)
-            start = partition_size * self.rank
-            end = start + partition_size
-            if start < param.ds_numel and end <= param.ds_numel:
-                if secondary_start < param.ds_numel and secondary_end <= param.ds_numel:
-                    sec_src_tensor = one_dim_param.narrow(0, secondary_start, secondary_partition_size)
-                    param.ds_secondary_tensor.copy_(sec_src_tensor)
-            else:
-                if start < param.ds_numel:
-                    elements_to_copy = param.ds_numel - start
-                    elements_to_copy_sec = elements_to_copy * param.ds_secondary_tensor_num_of_groups
-                    param.ds_secondary_tensor.narrow(0, 0, elements_to_copy_sec).copy_(
-                        one_dim_param.narrow(0, secondary_start, elements_to_copy_sec))
+            # ds_numel is unpadded, so the last chunk of the secondary tensor might not be secondary_partition_size
+            sec_numel = param.ds_numel - secondary_start if secondary_end > param.ds_numel else secondary_partition_size
+
+            # copy from full tensor to secondary tensor
+            param.ds_secondary_tensor.narrow(0, 0,
+                                             sec_numel).copy_(one_dim_param.narrow(0, secondary_start, sec_numel))
+
+            # TODO: This is a temporary fix to avoid the issue that 2nd tensor all-gather happens before 2nd tensor partition is done
+            get_accelerator().current_stream().synchronize()
 
             print_rank_0(f"{param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}",
                          force=False)