
Commit 96e643e: Merge branch 'master' into master
Authored by tjruwase on Sep 28, 2023
2 parents: fa8a914 + 388c848
Showing 12 changed files with 48 additions and 16 deletions.
.github/workflows/cpu-inference.yml (3 changes: 2 additions & 1 deletion)
@@ -76,4 +76,5 @@ jobs:
source oneCCL/build/_install/env/setvars.sh
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
-TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' -m 'inference_ops' -m 'inference' unit/
+TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
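A plausible motivation for splitting this command (an inference from pytest's behavior, not stated in the commit): pytest's -m flag takes a single marker expression, and repeating the flag replaces the earlier value rather than combining with it, so 'seq_inference' was effectively never selected by the old one-liner. A minimal sketch of how marker selection composes, using pytest's Python entry point:

import pytest

# Repeated "-m" flags do not accumulate; the last expression wins.
# To select several marker groups in one run, combine them with "or":
pytest.main(["-m", "inference_ops or inference", "unit/"])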
.github/workflows/nv-transformers-v100.yml (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@ concurrency:

jobs:
unit-tests:
-runs-on: [self-hosted, nvidia, cu111, v100]
+runs-on: [self-hosted, nvidia, cu116, v100]

steps:
- uses: actions/checkout@v3
blogs/deepspeed4science/chinese/README.md (4 changes: 2 additions & 2 deletions)
@@ -29,7 +29,7 @@ DeepSpeed4Science's new system technologies can be used for many landmark, boundary-pushing scientific...
#### Scientific Foundation Model (SFM), Microsoft Research AI4Science

<div align="center">
-<img src="../media/Figure2-1.jpg" width="800px" alt="" />
+<img src="../media/Figure2-1.png" width="800px" alt="" />
<img src="../media/Figure2-2.gif" width="800px" alt="" />

*Figure 2: The Scientific Foundation Model (SFM) and its current exploration: Distributional Graphormer.*
@@ -47,7 +47,7 @@ DeepSpeed4Science's new system technologies can be used for many landmark, boundary-pushing scientific...

Our climate is changing, leading to an increasing frequency of extreme weather events. To mitigate the negative impacts, it is becoming ever more important to predict where these events will occur. [ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/) is the first foundation model designed to perform a wide variety of weather and climate modeling tasks. It can absorb many datasets with different variables and resolutions to improve the accuracy of weather forecasts. DeepSpeed4Science is creating new system support and acceleration strategies for ClimaX to efficiently pretrain/fine-tune larger foundation models while handling very large, high-resolution image data (e.g., tens to hundreds of petabytes) and long sequences.

-#### AI Powered Ab Initio Molecular Dynamics (AI<sup>2</sup>MD), Microsoft Research AI4Science
+#### Molecular Dynamics and Machine Learning Force Field, Microsoft Research AI4Science

<div align="center">
<img src="../media/Figure4.gif" width="800px" alt="" />
blogs/deepspeed4science/japanese/README.md (4 changes: 2 additions & 2 deletions)
@@ -29,7 +29,7 @@ DeepSpeed4Science's new system technologies support a wide range of AI-driven sci...
#### Scientific Foundation Model (SFM), Microsoft Research AI4Science

<div align="center">
-<img src="../media/Figure2-1.jpg" width="800px" alt="" />
+<img src="../media/Figure2-1.png" width="800px" alt="" />
<img src="../media/Figure2-2.gif" width="800px" alt="" />

*Figure 2: The Scientific Foundation Model (SFM) and its exploration: Distributional Graphormer*
@@ -47,7 +47,7 @@ DeepSpeed4Science's new system technologies support a wide range of AI-driven sci...

Climate change is causing more frequent extreme weather events. To mitigate their adverse effects, it is increasingly important to predict where these events will occur. [ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/) is the first foundation model designed to perform a wide variety of weather and climate modeling tasks. Because it can handle many different datasets with varying variables and resolutions, it has the potential to improve the accuracy of weather forecasting. DeepSpeed4Science is providing new system support for ClimaX to efficiently pretrain/fine-tune larger foundation models while processing very large, high-resolution image data (e.g., tens to hundreds of petabytes) with long sequences.

-#### AI Powered Ab Initio Molecular Dynamics (AI<sup>2</sup>MD), Microsoft Research AI4Science
+#### Molecular Dynamics and Machine Learning Force Field, Microsoft Research AI4Science

<div align="center">
<img src="../media/Figure4.gif" width="800px" alt="" />
Binary file removed blogs/deepspeed4science/media/Figure2-1.jpg
Binary file added blogs/deepspeed4science/media/Figure2-1.png
deepspeed/runtime/zero/linear.py (5 changes: 4 additions & 1 deletion)
@@ -94,7 +94,10 @@ def backward(ctx, grad_output):
#print(f"Computed grad weight grad_weight {grad_weight.shape}")
if bias is not None and ctx.needs_input_grad[2]:
#print("Computing grad bias")
grad_bias = grad_output.sum(0)
if dim > 2:
grad_bias = grad_output.sum([i for i in range(dim - 1)])
else:
grad_bias = grad_output.sum(0)
#print("Done computing grad bias")
#print("needs bias")
#print(f"backward shaped grad_input {grad_input.shape}, grad_weight {grad_weight.shape}, grad_bias {grad_bias.shape if grad_bias is not None else None}")
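For context on the fix above: when the layer input has more than two dimensions, grad_output carries the same leading dimensions (e.g. [batch, seq, out_features]), and the bias gradient must be reduced over all of them rather than only dim 0. A minimal sketch with hypothetical shapes (illustrative, not from the commit):

import torch

grad_output = torch.randn(4, 8, 16)  # hypothetical [batch=4, seq=8, out_features=16]
dim = grad_output.dim()

# Sum over every dimension except the last, as in the patched branch.
grad_bias = grad_output.sum([i for i in range(dim - 1)]) if dim > 2 else grad_output.sum(0)
assert grad_bias.shape == (16,)  # one gradient entry per bias element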
deepspeed/runtime/zero/partition_parameters.py (16 changes: 11 additions & 5 deletions)
@@ -897,7 +897,7 @@ def __init__(
        self.quantized_weights = _ds_config.zero_config.zero_quantized_weights
        self.quantized_nontrainable_weights = zero_quantized_nontrainable_weights
        if _ds_config is not None and _ds_config.zero_config.zero_quantized_nontrainable_weights and not self.quantized_nontrainable_weights:
-            self.quantized_weights = _ds_config.zero_config.zero_quantized_nontrainable_weights
+            self.quantized_nontrainable_weights = _ds_config.zero_config.zero_quantized_nontrainable_weights

        self.module = module
        if (self.quantized_weights or self.quantized_nontrainable_weights):
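The one-line fix above corrects which attribute the fallback assigns: the guard tests zero_quantized_nontrainable_weights, but the old code wrote the value into quantized_weights. A stripped-down sketch of the intended precedence (simplified, hypothetical names; not the actual DeepSpeed config plumbing):

class InitSketch:
    def __init__(self, zero_quantized_nontrainable_weights, ds_config_flag=None):
        # The explicit constructor argument is the default ...
        self.quantized_nontrainable_weights = zero_quantized_nontrainable_weights
        # ... but a ds_config setting may still enable the feature.
        if ds_config_flag is not None and ds_config_flag and not self.quantized_nontrainable_weights:
            self.quantized_nontrainable_weights = ds_config_flag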
@@ -1476,7 +1476,10 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
        if start < param.ds_numel and end <= param.ds_numel:
            src_tensor = one_dim_param.narrow(0, start, partition_size)

-            param.ds_tensor.copy_(src_tensor)
+            with torch.no_grad():
+                # make sure param.ds_tensor's requires_grad is always False;
+                # otherwise the torch tracer will complain.
+                param.ds_tensor.copy_(src_tensor)

            #partitioned_tensor = src_tensor.clone().detach().to(self.remote_device)
@@ -1486,9 +1489,12 @@
            # device=self.remote_device )

        if start < param.ds_numel:
-            elements_to_copy = param.ds_numel - start
-            param.ds_tensor.narrow(0, 0,
-                                   elements_to_copy).copy_(one_dim_param.narrow(0, start, elements_to_copy))
+            elems_to_copy = param.ds_numel - start
+            with torch.no_grad():
+                # make sure param.ds_tensor's requires_grad is always False;
+                # otherwise the torch tracer will complain.
+                param.ds_tensor.narrow(0, 0,
+                                       elems_to_copy).copy_(one_dim_param.narrow(0, start, elems_to_copy))

        #print(f"Remote device {self.remote_device}")

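Both hunks apply the same pattern: an in-place copy_ into a tensor that requires grad is rejected by autograd (and confuses the tracer), so the copy is wrapped in torch.no_grad(). A minimal standalone sketch (hypothetical tensors, not DeepSpeed internals):

import torch

ds_tensor = torch.zeros(8, requires_grad=True)  # stand-in for param.ds_tensor
src = torch.randn(8)

# Without no_grad(), this raises: "a leaf Variable that requires grad
# is being used in an in-place operation."
with torch.no_grad():
    ds_tensor.copy_(src)  # excluded from autograd tracking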
tests/unit/hybrid_engine/test_he_all.py (4 changes: 4 additions & 0 deletions)
@@ -12,6 +12,10 @@
from deepspeed.accelerator import get_accelerator

from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
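The guard added to each hybrid-engine test file uses pytest's module-level skip, which aborts collection of the entire file when a precondition fails, rather than failing test by test. A self-contained sketch with a hypothetical precondition (the real check queries deepspeed.ops.__compatible_ops__):

import sys

import pytest

# Hypothetical precondition; any module-wide requirement works the same way.
if sys.platform == "win32":
    pytest.skip("These tests assume a POSIX system.", allow_module_level=True)


def test_placeholder():
    assert True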
tests/unit/hybrid_engine/test_he_llama.py (4 changes: 4 additions & 0 deletions)
@@ -12,6 +12,10 @@
from deepspeed.accelerator import get_accelerator

from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
tests/unit/hybrid_engine/test_he_lora.py (4 changes: 4 additions & 0 deletions)
@@ -14,6 +14,10 @@
from deepspeed.utils import safe_get_full_grad
import numpy.testing as npt
from unit.common import DistributedTest
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)

tests/unit/inference/test_inference.py (18 changes: 14 additions & 4 deletions)
@@ -22,9 +22,6 @@
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import InferenceBuilder

-if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
-    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)
-
rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
pytest.skip("skip inference tests on rocm for now", allow_module_level=True)
@@ -365,6 +362,9 @@ def test(
        if invalid_test_msg:
            pytest.skip(invalid_test_msg)

+        if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+            pytest.skip("This op has not been implemented on this system.", allow_module_level=True)
+
        model, task = model_w_task
        local_rank = int(os.getenv("LOCAL_RANK", "0"))

@@ -401,6 +401,9 @@
    ):
        model, task = model_w_task
        dtype = torch.float16
+        if dtype not in get_accelerator().supported_dtypes():
+            pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.")
+
        local_rank = int(os.getenv("LOCAL_RANK", "0"))

        pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
@@ -514,7 +517,7 @@ def test(
[("Helsinki-NLP/opus-mt-en-de", "translation"), ("Salesforce/codegen-350M-mono", "text-generation")],
ids=["marian", "codegen"], #codegen has fusedqkv weight.
)
@pytest.mark.parametrize("dtype", [torch.float16], ids=["fp16"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
class TestAutoTensorParallelism(DistributedTest):
world_size = [2]

@@ -530,6 +533,13 @@ def test(
        if invalid_test_msg:
            pytest.skip(invalid_test_msg)

+        if dtype not in get_accelerator().supported_dtypes():
+            pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.")
+
+        # TODO: enable this test after the torch 2.1 stable release
+        if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono":
+            pytest.skip("Codegen model (bf16) needs torch version > 2.0.")
+
        model, task = model_w_task
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        world_size = int(os.getenv("WORLD_SIZE", "2"))
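The guards added throughout this file follow one pattern: parametrize the test over dtypes, then skip at runtime when the active accelerator cannot execute that dtype. A minimal sketch (SUPPORTED_DTYPES is a hypothetical stand-in for get_accelerator().supported_dtypes()):

import pytest
import torch

SUPPORTED_DTYPES = {torch.float32, torch.float16}  # hypothetical capability set


@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
def test_add(dtype):
    if dtype not in SUPPORTED_DTYPES:
        pytest.skip(f"This accelerator does not support {dtype}.")
    x = torch.ones(2, dtype=dtype)
    assert (x + x).dtype == dtype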
