*Figure 2: Scientific foundation model (SFM) and its exploration: Distributional Graphormer*
@@ -47,7 +47,7 @@ The new system technologies from DeepSpeed4Science empower a broad range of AI-driven sci
Climate change is causing more frequent extreme weather events. To mitigate the negative effects, it is increasingly important to predict where these events will occur. [ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/) is the first foundation model designed to perform a wide variety of weather and climate modeling tasks. Because it can ingest many different datasets with varying variables and resolutions, it has the potential to improve the accuracy of weather forecasting. DeepSpeed4Science provides new system support for ClimaX to efficiently pretrain/finetune larger foundation models while handling very large, high-resolution image data (e.g., tens to hundreds of petabytes) with long sequences.
-#### AI Powered Ab Initio Molecular Dynamics (AI MD), Microsoft Research AI4Science
+#### Molecular Dynamics and Machine Learning Force Field, Microsoft Research AI4Science
diff --git a/blogs/deepspeed4science/media/Figure2-1.jpg b/blogs/deepspeed4science/media/Figure2-1.jpg
deleted file mode 100644
index 6008ccd91d09..000000000000
Binary files a/blogs/deepspeed4science/media/Figure2-1.jpg and /dev/null differ
diff --git a/blogs/deepspeed4science/media/Figure2-1.png b/blogs/deepspeed4science/media/Figure2-1.png
new file mode 100644
index 000000000000..bb0b8d9206d1
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure2-1.png differ
diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py
index 175013361fb8..e9dd78864cde 100644
--- a/deepspeed/runtime/zero/linear.py
+++ b/deepspeed/runtime/zero/linear.py
@@ -94,7 +94,10 @@ def backward(ctx, grad_output):
#print(f"Computed grad weight grad_weight {grad_weight.shape}")
if bias is not None and ctx.needs_input_grad[2]:
#print("Computing grad bias")
- grad_bias = grad_output.sum(0)
+ if dim > 2:
+     # inputs carry extra leading dims (e.g. [batch, seq, features]),
+     # so reduce every dim except the last to recover the bias gradient
+     grad_bias = grad_output.sum([i for i in range(dim - 1)])
+ else:
+     grad_bias = grad_output.sum(0)
#print("Done computing grad bias")
#print("needs bias")
#print(f"backward shaped grad_input {grad_input.shape}, grad_weight {grad_weight.shape}, grad_bias {grad_bias.shape if grad_bias is not None else None}")
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index 0a3cb030f9cf..de94d757c3b7 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -897,7 +897,7 @@ def __init__(
self.quantized_weights = _ds_config.zero_config.zero_quantized_weights
self.quantized_nontrainable_weights = zero_quantized_nontrainable_weights
if _ds_config is not None and _ds_config.zero_config.zero_quantized_nontrainable_weights and not self.quantized_nontrainable_weights:
- self.quantized_weights = _ds_config.zero_config.zero_quantized_nontrainable_weights
+ self.quantized_nontrainable_weights = _ds_config.zero_config.zero_quantized_nontrainable_weights
self.module = module
if (self.quantized_weights or self.quantized_nontrainable_weights):
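The one-line fix above corrects a copy-paste slip: the fallback that reads `zero_quantized_nontrainable_weights` from the config was overwriting `self.quantized_weights` instead of setting the nontrainable flag. A reduced sketch of the intended precedence, using standalone names rather than DeepSpeed internals:

```python
# Reduced sketch (hypothetical standalone names) of the precedence restored
# above: an explicit argument wins, otherwise fall back to the ds_config
# value -- and the fallback must set the *nontrainable* flag.
def resolve_quantization_flags(arg_nontrainable, cfg_weights, cfg_nontrainable):
    quantized_weights = cfg_weights
    quantized_nontrainable_weights = arg_nontrainable
    if cfg_nontrainable and not quantized_nontrainable_weights:
        # the buggy line assigned this value to quantized_weights instead
        quantized_nontrainable_weights = cfg_nontrainable
    return quantized_weights, quantized_nontrainable_weights
```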
@@ -1476,7 +1476,10 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
if start < param.ds_numel and end <= param.ds_numel:
src_tensor = one_dim_param.narrow(0, start, partition_size)
- param.ds_tensor.copy_(src_tensor)
+ with torch.no_grad():
+     # ensure param.ds_tensor's requires_grad is always False;
+     # otherwise, the torch tracer will complain.
+ param.ds_tensor.copy_(src_tensor)
#partitioned_tensor = src_tensor.clone().detach().to(self.remote_device)
@@ -1486,9 +1489,12 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
# device=self.remote_device )
if start < param.ds_numel:
- elements_to_copy = param.ds_numel - start
- param.ds_tensor.narrow(0, 0,
- elements_to_copy).copy_(one_dim_param.narrow(0, start, elements_to_copy))
+ elems_to_copy = param.ds_numel - start
+ with torch.no_grad():
+     # ensure param.ds_tensor's requires_grad is always False;
+     # otherwise, the torch tracer will complain.
+ param.ds_tensor.narrow(0, 0,
+ elems_to_copy).copy_(one_dim_param.narrow(0, start, elems_to_copy))
#print(f"Remote device {self.remote_device}")
diff --git a/tests/unit/hybrid_engine/test_he_all.py b/tests/unit/hybrid_engine/test_he_all.py
index 86eabb1add0c..aa1f120645b1 100644
--- a/tests/unit/hybrid_engine/test_he_all.py
+++ b/tests/unit/hybrid_engine/test_he_all.py
@@ -12,6 +12,10 @@
from deepspeed.accelerator import get_accelerator
from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+ pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
diff --git a/tests/unit/hybrid_engine/test_he_llama.py b/tests/unit/hybrid_engine/test_he_llama.py
index 5f992f69b402..fcf5b8ffb89b 100644
--- a/tests/unit/hybrid_engine/test_he_llama.py
+++ b/tests/unit/hybrid_engine/test_he_llama.py
@@ -12,6 +12,10 @@
from deepspeed.accelerator import get_accelerator
from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+ pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
diff --git a/tests/unit/hybrid_engine/test_he_lora.py b/tests/unit/hybrid_engine/test_he_lora.py
index f61fdeb3a9f9..ea27239ed55e 100644
--- a/tests/unit/hybrid_engine/test_he_lora.py
+++ b/tests/unit/hybrid_engine/test_he_lora.py
@@ -14,6 +14,10 @@
from deepspeed.utils import safe_get_full_grad
import numpy.testing as npt
from unit.common import DistributedTest
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+ pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
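All three hybrid-engine test modules gain the same guard: when the inference op is not compiled for the current system, the entire module is skipped at import time, which is what `allow_module_level=True` permits. (The test_inference.py change below goes the opposite way, moving the check inside individual tests so the rest of the module keeps running.) A self-contained sketch of the module-level pattern, with a stand-in for the real capability probe:

```python
# Module-level skip sketch; OP_IS_COMPILED stands in for
# deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]
import pytest

OP_IS_COMPILED = False

if not OP_IS_COMPILED:
    # outside a test function, pytest.skip() requires allow_module_level=True
    pytest.skip("required op has not been implemented on this system.",
                allow_module_level=True)
```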
diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index 4ee3cd73c045..894f040be207 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -22,9 +22,6 @@
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import InferenceBuilder
-if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
- pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
-
rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
pytest.skip("skip inference tests on rocm for now", allow_module_level=True)
@@ -365,6 +362,9 @@ def test(
if invalid_test_msg:
pytest.skip(invalid_test_msg)
+ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+     pytest.skip("This op has not been implemented on this system.")
+
model, task = model_w_task
local_rank = int(os.getenv("LOCAL_RANK", "0"))
@@ -401,6 +401,9 @@ def test(
):
model, task = model_w_task
dtype = torch.float16
+ if dtype not in get_accelerator().supported_dtypes():
+ pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
+
local_rank = int(os.getenv("LOCAL_RANK", "0"))
pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
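The guard prefers a skip over a hard failure when the accelerator cannot run the requested precision, which matters once bf16 joins the parameter matrix below. The two accelerator calls it relies on, exercised standalone:

```python
# Both calls below appear verbatim in the patch above.
import torch
from deepspeed.accelerator import get_accelerator

for dtype in (torch.float16, torch.bfloat16):
    if dtype not in get_accelerator().supported_dtypes():
        print(f"{get_accelerator().device_name()} does not support {dtype}")
```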
@@ -514,7 +517,7 @@ def test(
[("Helsinki-NLP/opus-mt-en-de", "translation"), ("Salesforce/codegen-350M-mono", "text-generation")],
ids=["marian", "codegen"], #codegen has fusedqkv weight.
)
-@pytest.mark.parametrize("dtype", [torch.float16], ids=["fp16"])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
class TestAutoTensorParallelism(DistributedTest):
world_size = [2]
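The widened `dtype` axis doubles the matrix for `TestAutoTensorParallelism`: each (model, task) pair now runs under both fp16 and bf16, with `ids` giving readable test names. A minimal sketch of how stacked parametrization composes:

```python
# Minimal sketch: stacked parametrize decorators form a cross product, so
# two model cases x two dtypes = four collected tests.
import pytest
import torch

@pytest.mark.parametrize("model_id", ["marian", "codegen"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
def test_matrix(model_id, dtype):
    assert dtype in (torch.float16, torch.bfloat16)
```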
@@ -530,6 +533,13 @@ def test(
if invalid_test_msg:
pytest.skip(invalid_test_msg)
+ if dtype not in get_accelerator().supported_dtypes():
+ pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
+
+ # TODO: enable this test after torch 2.1 stable release
+ if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono":
+ pytest.skip("Codegen model(bf16) need to use torch version > 2.0.")
+
model, task = model_w_task
local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "2"))
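The bf16 codegen case is skipped wholesale pending the torch 2.1 release noted in the TODO; once available, the skip could become a version gate. A hedged sketch of such a gate (the helper name is illustrative):

```python
# Illustrative helper for turning the blanket skip into a version gate;
# `packaging` is a standard pip dependency.
from packaging import version
import torch

def torch_at_least(minimum: str) -> bool:
    return version.parse(torch.__version__) >= version.parse(minimum)

# e.g. run the bf16 codegen case only on torch >= 2.1
run_bf16_codegen = torch_at_least("2.1")
```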