Commit 133836c: Merge branch 'master' into master

Authored by tjruwase on Aug 3, 2024
2 parents: b7a2a06 + 2ef8223

Showing 111 changed files with 2,787 additions and 633 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/amd-mi200.yml
@@ -32,7 +32,7 @@ jobs:

       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm5.6
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
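Note: the two `python -c` checks above only confirm that torch imports and sees a device. A slightly fuller sanity check for the ROCm wheel, as a sketch (the printed values are illustrative):

```python
import torch

print("torch:", torch.__version__)                  # e.g. '2.x.x+rocm6.0' for a ROCm wheel
print("HIP:", torch.version.hip)                    # version string on ROCm builds, None otherwise
print("GPU available:", torch.cuda.is_available())  # ROCm devices surface through the CUDA API
```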
2 changes: 2 additions & 0 deletions .github/workflows/cpu-inference.yml
@@ -24,6 +24,8 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, cpu]
 
+    env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
+
     steps:
       - uses: actions/checkout@v3

6 changes: 3 additions & 3 deletions .github/workflows/cpu-torch-latest.yml
@@ -19,7 +19,7 @@ concurrency:

 jobs:
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
@@ -50,5 +50,5 @@ jobs:
       run: |
         unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
         cd tests
-        HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.3"
-        HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.3"
+        HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.4"
+        HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.4"
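Note on the flags above: `-n 4` comes from the pytest-xdist plugin and fans tests out across four workers, while `-m 'sequential'` then re-runs only the tests carrying the `sequential` marker in a single process. A minimal sketch of how such a marker is declared and applied (an assumed pattern; the repo's actual conftest/pytest.ini may differ):

```python
import pytest

@pytest.mark.sequential  # selected by: pytest -m 'sequential'
def test_needs_exclusive_resources():
    # Tests that cannot safely share a device or port run here,
    # after the parallel (-n 4) pass has finished.
    assert True
```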
2 changes: 1 addition & 1 deletion .github/workflows/formatting.yml
@@ -18,7 +18,7 @@ jobs:

   # formatting and basic install on cpu-only machine
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
2 changes: 1 addition & 1 deletion .github/workflows/nv-human-eval.yml
@@ -17,7 +17,7 @@ jobs:
       options: --gpus all --shm-size "8G"
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Check container state
         run: |
2 changes: 2 additions & 0 deletions .github/workflows/nv-lightning-v100.yml
@@ -21,6 +21,8 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, cu111, v100]
 
+    env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
+
     steps:
       - uses: actions/checkout@v3

4 changes: 2 additions & 2 deletions .github/workflows/nv-mii.yml
@@ -37,7 +37,7 @@ jobs:

       - name: Install pytorch
         run: |
-          pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
+          pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -46,7 +46,7 @@
         git clone https://github.com/huggingface/transformers
         cd transformers
         # if needed switch to the last known good SHA until transformers@master is fixed
-        git checkout bdf36dc
+        git checkout v4.42.4
         git rev-parse --short HEAD
         pip install .
2 changes: 1 addition & 1 deletion .github/workflows/nv-pre-compile-ops.yml
@@ -21,7 +21,7 @@ concurrency:

 jobs:
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     container:
       image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116

4 changes: 2 additions & 2 deletions .github/workflows/nv-torch-latest-v100.yml
@@ -55,5 +55,5 @@ jobs:
       run: |
         unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
         cd tests
-        pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.3" --cuda_ver="11.8"
-        pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.3" --cuda_ver="11.8"
+        pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="11.8"
+        pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="11.8"
2 changes: 2 additions & 0 deletions .github/workflows/nv-torch110-p40.yml
@@ -17,6 +17,8 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, cu111, p40]
 
+    env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
+
     steps:
       - uses: actions/checkout@v3

2 changes: 2 additions & 0 deletions .github/workflows/nv-torch110-v100.yml
@@ -17,6 +17,8 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, cu111, v100]
 
+    env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
+
     steps:
       - uses: actions/checkout@v3

6 changes: 3 additions & 3 deletions .github/workflows/python.yml
@@ -21,15 +21,15 @@ jobs:
   unit-tests:
     strategy:
       matrix:
-        pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"]
+        pyVersion: ["3.7", "3.8", "3.9", "3.10"]
       fail-fast: false
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     container:
       image: deepspeed/gh-builder:py${{ matrix.pyVersion }}
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: environment
         run: |
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -7,7 +7,7 @@ on:

 jobs:
   deploy:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     environment: release-env
 
     steps:
4 changes: 3 additions & 1 deletion .github/workflows/xpu-max1100.yml
@@ -11,6 +11,7 @@ on:
- "accelerator/abstract_accelerator.py"
- "accelerator/cpu_accelerator.py"
- "accelerator/real_accelerator.py"
- "csrc/xpu/**"
- "deepspeed/runtime/engine.py"
- "deepspeed/runtime/bf16_optimizer.py"
- "deepspeed/runtime/zero/stage_1_and_2.py"
@@ -20,6 +21,7 @@ on:
- "deepspeed/runtime/zero/parameter_offload.py"
- "deepspeed/runtime/pipe/engine.py"
- "deepspeed/runtime/utils.py"
- "opbuilder/xpu/**"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
@@ -34,7 +36,7 @@
   unit-tests:
     runs-on: [self-hosted, intel, xpu]
     container:
-      image: intel/intel-extension-for-pytorch:2.1.20-xpu
+      image: intel/intel-extension-for-pytorch:2.1.30-xpu
       ports:
         - 80
       options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
4 changes: 4 additions & 0 deletions README.md
@@ -15,6 +15,7 @@
 ## Latest News
 <b> <span style="color:orange" > DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat)</span>.</b>
 
+* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
 * [2024/03] [DeepSpeed-FP6: The power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
 * [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
 * [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html)
@@ -270,6 +271,9 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information
 30. Xiaoxia Wu, Haojun Xia, Stephen Youn, Zhen Zheng, Shiyang Chen, Arash Bakhtiari, Michael Wyatt, Reza Yazdani Aminabadi, Yuxiong He, Olatunji Ruwase, Leon Song, Zhewei Yao (2023) ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks [arXiv:2312.08583](https://arxiv.org/abs/2312.08583)
 
 31. Haojun Xia, Zhen Zheng, Xiaoxia Wu, Shiyang Chen, Zhewei Yao, Stephen Youn, Arash Bakhtiari, Michael Wyatt, Donglin Zhuang, Zhongzhu Zhou, Olatunji Ruwase, Yuxiong He, Shuaiwen Leon Song. (2024) FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design [arXiv:2401.14112](https://arxiv.org/abs/2401.14112)
+32. Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Reza Yazdani Aminadabi, Shuaiwen Leon Song, Samyam Rajbhandari, Yuxiong He. (2024) [System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://dl.acm.org/doi/10.1145/3662158.3662806)
+33. Xinyu Lian, Sam Ade Jacobs, Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, Minjia Zhang. (2024) Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training [arXiv:2406.18820](https://arxiv.org/abs/2406.18820)
15 changes: 8 additions & 7 deletions accelerator/hpu_accelerator.py
@@ -3,6 +3,7 @@

 # DeepSpeed Team
 
+import functools
 import os
 import pkgutil
 import importlib
@@ -196,31 +197,31 @@ def replay_graph(self, graph):
     # Tensor operations
     @property
     def BFloat16Tensor(self):
-        return self.hpu.BFloat16Tensor
+        return functools.partial(torch.tensor, dtype=torch.bfloat16, device='hpu')
 
     @property
     def ByteTensor(self):
-        return self.hpu.ByteTensor
+        return functools.partial(torch.tensor, dtype=torch.uint8, device='hpu')
 
     @property
     def DoubleTensor(self):
-        return self.hpu.DoubleTensor
+        return functools.partial(torch.tensor, dtype=torch.double, device='hpu')
 
     @property
     def FloatTensor(self):
-        return self.hpu.FloatTensor
+        return functools.partial(torch.tensor, dtype=torch.float, device='hpu')
 
     @property
     def HalfTensor(self):
-        return self.hpu.HalfTensor
+        return functools.partial(torch.tensor, dtype=torch.half, device='hpu')
 
     @property
     def IntTensor(self):
-        return self.hpu.IntTensor
+        return functools.partial(torch.tensor, dtype=torch.int, device='hpu')
 
     @property
     def LongTensor(self):
-        return self.hpu.LongTensor
+        return functools.partial(torch.tensor, dtype=torch.long, device='hpu')
 
     def pin_memory(self, tensor, align_bytes=1):
         return tensor.pin_memory(self.device())
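Note: these accessors now return tensor factories built with `functools.partial` instead of the legacy `self.hpu.*Tensor` constructor types. A minimal sketch of the pattern (using `device='cpu'` so it runs without Gaudi hardware; the diff pins `device='hpu'`):

```python
import functools

import torch

# Each accessor yields a callable that behaves like the old *Tensor constructor:
FloatTensor = functools.partial(torch.tensor, dtype=torch.float, device='cpu')

t = FloatTensor([1.0, 2.0, 3.0])  # same call shape as torch.FloatTensor([...])
print(t.dtype, t.device)          # torch.float32 cpu
```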
6 changes: 5 additions & 1 deletion accelerator/xpu_accelerator.py
@@ -26,7 +26,11 @@ def is_synchronized_device(self):
         return False
 
     def use_host_timers(self):
-        return self.is_synchronized_device()
+        # WA XPU event will be consolidated in 2.5
+        if ipex.__version__ < '2.5':
+            return True
+        else:
+            return self.is_synchronized_device()
 
     def resolves_data_dependency(self):
         return self.is_synchronized_device()
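Note on the guard above: `ipex.__version__ < '2.5'` is a lexicographic string comparison. It holds for the versions in play here, but it would misorder a hypothetical '2.10' against '2.5'. A more robust variant, as a sketch using the `packaging` library (not what the commit itself does):

```python
from packaging.version import Version

def use_host_timers(ipex_version: str, synchronized_device: bool) -> bool:
    # Workaround until XPU events are consolidated in IPEX 2.5 (per the diff's comment)
    if Version(ipex_version) < Version("2.5"):
        return True
    return synchronized_device

print(use_host_timers("2.1.30", False))  # True  -> fall back to host-side timers
print(use_host_timers("2.5.0", False))   # False -> defer to device synchronization
```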
2 changes: 2 additions & 0 deletions bin/deepspeed.bat
@@ -0,0 +1,2 @@
+@echo off
+python "%~dp0\ds" %*
2 changes: 2 additions & 0 deletions bin/ds_report.bat
@@ -0,0 +1,2 @@
+@echo off
+python "%~dp0\ds_report" %*
3 changes: 3 additions & 0 deletions blogs/deepspeed-fastgen/README.md
@@ -231,7 +231,10 @@ We currently support the following model architectures in this alpha release of
 * [Falcon](https://huggingface.co/models?other=falcon)
 * [Mixtral](https://huggingface.co/models?other=mixtral)
 * [Phi-2](https://huggingface.co/models?other=phi-msft)
+* [Phi-3](https://huggingface.co/models?other=phi3)
 * [Qwen](https://huggingface.co/models?other=qwen)
+* [Qwen2](https://huggingface.co/models?other=qwen2)
+* [Qwen2-MoE](https://huggingface.co/models?other=qwen2_moe)
 
 All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.

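Note: since all supported models pull weights and tokenizers through HuggingFace, trying one of the newly listed architectures looks roughly like this with DeepSpeed-MII's non-persistent pipeline (a sketch; the model name and generation settings are illustrative, and the pipeline API is assumed from the MII README):

```python
import mii

# Any FastGen-supported architecture from the list above should work here.
pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")
response = pipe(["DeepSpeed is"], max_new_tokens=64)
print(response)
```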
4 changes: 4 additions & 0 deletions blogs/deepspeed-fastgen/chinese/README.md
@@ -226,6 +226,10 @@ DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII
 * [LLaMA](https://huggingface.co/models?other=llama) [LLaMA-2](https://huggingface.co/models?other=llama-2)
 * [Mistral](https://huggingface.co/models?other=mistral)
 * [OPT](https://huggingface.co/models?other=opt)
+* [Falcon](https://huggingface.co/models?other=falcon)
+* [Mixtral](https://huggingface.co/models?other=mixtral)
+* [Phi-2](https://huggingface.co/models?other=phi-msft)
+* [Qwen](https://huggingface.co/models?other=qwen)
 
 所有当前模型都利用了后端的 [HuggingFace](https://github.com/huggingface) API 来提供模型权重和模型对应的分词器。
