
Commit 96e643e: Merge branch 'master' into master
Authored by tjruwase on Sep 28, 2023
2 parents: fa8a914 + 388c848
Showing 12 changed files with 48 additions and 16 deletions.
.github/workflows/cpu-inference.yml (3 changes: 2 additions & 1 deletion)
@@ -76,4 +76,5 @@ jobs:
source oneCCL/build/_install/env/setvars.sh
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
-TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' -m 'inference_ops' -m 'inference' unit/
+TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
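A plausible motivation for splitting this command (an inference from pytest's behavior, not stated in the commit): pytest's -m flag takes a single marker expression, and repeating the flag replaces the earlier value rather than combining with it, so 'seq_inference' was effectively never selected by the old one-liner. A minimal sketch of how marker selection composes, using pytest's Python entry point:

import pytest

# Repeated "-m" flags do not accumulate; the last expression wins.
# To select several marker groups in one run, combine them with "or":
pytest.main(["-m", "inference_ops or inference", "unit/"])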
.github/workflows/nv-transformers-v100.yml (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@ concurrency:

jobs:
unit-tests:
-runs-on: [self-hosted, nvidia, cu111, v100]
+runs-on: [self-hosted, nvidia, cu116, v100]

steps:
- uses: actions/checkout@v3
blogs/deepspeed4science/chinese/README.md (4 changes: 2 additions & 2 deletions)
@@ -29,7 +29,7 @@ DeepSpeed4Science's new system technologies can be used for many landmark, boundary-pushing scientific...
#### Scientific Foundation Model (SFM), Microsoft Research AI4Science

<div align="center">
-<img src="../media/Figure2-1.jpg" width="800px" alt="" />
+<img src="../media/Figure2-1.png" width="800px" alt="" />
<img src="../media/Figure2-2.gif" width="800px" alt="" />

*Figure 2: The Scientific Foundation Model (SFM) and its current exploration: Distributional Graphormer.*
@@ -47,7 +47,7 @@ DeepSpeed4Science's new system technologies can be used for many landmark, boundary-pushing scientific...

Our climate is changing, leading to an increasing frequency of extreme weather events. To mitigate the negative impacts, it is becoming ever more important to predict where these events will occur. [ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/) is the first foundation model designed to perform a wide variety of weather and climate modeling tasks. It can absorb many datasets with different variables and resolutions to improve the accuracy of weather forecasts. DeepSpeed4Science is creating new system support and acceleration strategies for ClimaX to efficiently pretrain/fine-tune larger foundation models while handling very large, high-resolution image data (e.g., tens to hundreds of petabytes) and long sequences.

-#### AI Powered Ab Initio Molecular Dynamics (AI<sup>2</sup>MD), Microsoft Research AI4Science
+#### Molecular Dynamics and Machine Learning Force Field, Microsoft Research AI4Science

<div align="center">
<img src="../media/Figure4.gif" width="800px" alt="" />
blogs/deepspeed4science/japanese/README.md (4 changes: 2 additions & 2 deletions)
@@ -29,7 +29,7 @@ DeepSpeed4Science's new system technologies support a wide range of AI-driven sci...
#### Scientific Foundation Model (SFM), Microsoft Research AI4Science

<div align="center">
-<img src="../media/Figure2-1.jpg" width="800px" alt="" />
+<img src="../media/Figure2-1.png" width="800px" alt="" />
<img src="../media/Figure2-2.gif" width="800px" alt="" />

*Figure 2: The Scientific Foundation Model (SFM) and its exploration: Distributional Graphormer*
@@ -47,7 +47,7 @@ DeepSpeed4Science's new system technologies support a wide range of AI-driven sci...

Climate change is causing more frequent extreme weather events. To mitigate their adverse effects, it is increasingly important to predict where these events will occur. [ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/) is the first foundation model designed to perform a wide variety of weather and climate modeling tasks. Because it can handle many different datasets with varying variables and resolutions, it has the potential to improve the accuracy of weather forecasting. DeepSpeed4Science is providing new system support for ClimaX to efficiently pretrain/fine-tune larger foundation models while processing very large, high-resolution image data (e.g., tens to hundreds of petabytes) with long sequences.

-#### AI Powered Ab Initio Molecular Dynamics (AI<sup>2</sup>MD), Microsoft Research AI4Science
+#### Molecular Dynamics and Machine Learning Force Field, Microsoft Research AI4Science

<div align="center">
<img src="../media/Figure4.gif" width="800px" alt="" />
Binary file removed blogs/deepspeed4science/media/Figure2-1.jpg
Binary file added blogs/deepspeed4science/media/Figure2-1.png
deepspeed/runtime/zero/linear.py (5 changes: 4 additions & 1 deletion)
@@ -94,7 +94,10 @@ def backward(ctx, grad_output):
#print(f"Computed grad weight grad_weight {grad_weight.shape}")
if bias is not None and ctx.needs_input_grad[2]:
#print("Computing grad bias")
grad_bias = grad_output.sum(0)
if dim > 2:
grad_bias = grad_output.sum([i for i in range(dim - 1)])
else:
grad_bias = grad_output.sum(0)
#print("Done computing grad bias")
#print("needs bias")
#print(f"backward shaped grad_input {grad_input.shape}, grad_weight {grad_weight.shape}, grad_bias {grad_bias.shape if grad_bias is not None else None}")
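For context on the fix above: when the layer input has more than two dimensions, grad_output carries the same leading dimensions (e.g. [batch, seq, out_features]), and the bias gradient must be reduced over all of them rather than only dim 0. A minimal sketch with hypothetical shapes (illustrative, not from the commit):

import torch

grad_output = torch.randn(4, 8, 16)  # hypothetical [batch=4, seq=8, out_features=16]
dim = grad_output.dim()

# Sum over every dimension except the last, as in the patched branch.
grad_bias = grad_output.sum([i for i in range(dim - 1)]) if dim > 2 else grad_output.sum(0)
assert grad_bias.shape == (16,)  # one gradient entry per bias element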
deepspeed/runtime/zero/partition_parameters.py (16 changes: 11 additions & 5 deletions)
@@ -897,7 +897,7 @@ def __init__(
        self.quantized_weights = _ds_config.zero_config.zero_quantized_weights
        self.quantized_nontrainable_weights = zero_quantized_nontrainable_weights
        if _ds_config is not None and _ds_config.zero_config.zero_quantized_nontrainable_weights and not self.quantized_nontrainable_weights:
-            self.quantized_weights = _ds_config.zero_config.zero_quantized_nontrainable_weights
+            self.quantized_nontrainable_weights = _ds_config.zero_config.zero_quantized_nontrainable_weights

        self.module = module
        if (self.quantized_weights or self.quantized_nontrainable_weights):
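The one-line fix above corrects which attribute the fallback assigns: the guard tests zero_quantized_nontrainable_weights, but the old code wrote the value into quantized_weights. A stripped-down sketch of the intended precedence (simplified, hypothetical names; not the actual DeepSpeed config plumbing):

class InitSketch:
    def __init__(self, zero_quantized_nontrainable_weights, ds_config_flag=None):
        # The explicit constructor argument is the default ...
        self.quantized_nontrainable_weights = zero_quantized_nontrainable_weights
        # ... but a ds_config setting may still enable the feature.
        if ds_config_flag is not None and ds_config_flag and not self.quantized_nontrainable_weights:
            self.quantized_nontrainable_weights = ds_config_flag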
@@ -1476,7 +1476,10 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
        if start < param.ds_numel and end <= param.ds_numel:
            src_tensor = one_dim_param.narrow(0, start, partition_size)

-            param.ds_tensor.copy_(src_tensor)
+            with torch.no_grad():
+                # make sure param.ds_tensor's requires_grad is always False;
+                # otherwise the torch tracer will complain.
+                param.ds_tensor.copy_(src_tensor)

            #partitioned_tensor = src_tensor.clone().detach().to(self.remote_device)
@@ -1486,9 +1489,12 @@
            # device=self.remote_device )

        if start < param.ds_numel:
-            elements_to_copy = param.ds_numel - start
-            param.ds_tensor.narrow(0, 0,
-                                   elements_to_copy).copy_(one_dim_param.narrow(0, start, elements_to_copy))
+            elems_to_copy = param.ds_numel - start
+            with torch.no_grad():
+                # make sure param.ds_tensor's requires_grad is always False;
+                # otherwise the torch tracer will complain.
+                param.ds_tensor.narrow(0, 0,
+                                       elems_to_copy).copy_(one_dim_param.narrow(0, start, elems_to_copy))

        #print(f"Remote device {self.remote_device}")

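Both hunks apply the same pattern: an in-place copy_ into a tensor that requires grad is rejected by autograd (and confuses the tracer), so the copy is wrapped in torch.no_grad(). A minimal standalone sketch (hypothetical tensors, not DeepSpeed internals):

import torch

ds_tensor = torch.zeros(8, requires_grad=True)  # stand-in for param.ds_tensor
src = torch.randn(8)

# Without no_grad(), this raises: "a leaf Variable that requires grad
# is being used in an in-place operation."
with torch.no_grad():
    ds_tensor.copy_(src)  # excluded from autograd tracking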
tests/unit/hybrid_engine/test_he_all.py (4 changes: 4 additions & 0 deletions)
@@ -12,6 +12,10 @@
from deepspeed.accelerator import get_accelerator

from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
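The guard added to each hybrid-engine test file uses pytest's module-level skip, which aborts collection of the entire file when a precondition fails, rather than failing test by test. A self-contained sketch with a hypothetical precondition (the real check queries deepspeed.ops.__compatible_ops__):

import sys

import pytest

# Hypothetical precondition; any module-wide requirement works the same way.
if sys.platform == "win32":
    pytest.skip("These tests assume a POSIX system.", allow_module_level=True)


def test_placeholder():
    assert True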
tests/unit/hybrid_engine/test_he_llama.py (4 changes: 4 additions & 0 deletions)
@@ -12,6 +12,10 @@
from deepspeed.accelerator import get_accelerator

from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
tests/unit/hybrid_engine/test_he_lora.py (4 changes: 4 additions & 0 deletions)
@@ -14,6 +14,10 @@
from deepspeed.utils import safe_get_full_grad
import numpy.testing as npt
from unit.common import DistributedTest
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)

tests/unit/inference/test_inference.py (18 changes: 14 additions & 4 deletions)
@@ -22,9 +22,6 @@
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import InferenceBuilder

-if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
-    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)
-
rocm_version = OpBuilder.installed_rocm_version()
if rocm_version != (0, 0):
pytest.skip("skip inference tests on rocm for now", allow_module_level=True)
@@ -365,6 +362,9 @@ def test(
        if invalid_test_msg:
            pytest.skip(invalid_test_msg)

+        if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+            pytest.skip("This op has not been implemented on this system.", allow_module_level=True)
+
        model, task = model_w_task
        local_rank = int(os.getenv("LOCAL_RANK", "0"))

@@ -401,6 +401,9 @@
    ):
        model, task = model_w_task
        dtype = torch.float16
+        if dtype not in get_accelerator().supported_dtypes():
+            pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.")
+
        local_rank = int(os.getenv("LOCAL_RANK", "0"))

        pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
@@ -514,7 +517,7 @@ def test(
[("Helsinki-NLP/opus-mt-en-de", "translation"), ("Salesforce/codegen-350M-mono", "text-generation")],
ids=["marian", "codegen"], #codegen has fusedqkv weight.
)
@pytest.mark.parametrize("dtype", [torch.float16], ids=["fp16"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
class TestAutoTensorParallelism(DistributedTest):
world_size = [2]

@@ -530,6 +533,13 @@ def test(
        if invalid_test_msg:
            pytest.skip(invalid_test_msg)

+        if dtype not in get_accelerator().supported_dtypes():
+            pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.")
+
+        # TODO: enable this test after the torch 2.1 stable release
+        if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono":
+            pytest.skip("Codegen model (bf16) needs torch version > 2.0.")
+
        model, task = model_w_task
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        world_size = int(os.getenv("WORLD_SIZE", "2"))
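The guards added throughout this file follow one pattern: parametrize the test over dtypes, then skip at runtime when the active accelerator cannot execute that dtype. A minimal sketch (SUPPORTED_DTYPES is a hypothetical stand-in for get_accelerator().supported_dtypes()):

import pytest
import torch

SUPPORTED_DTYPES = {torch.float32, torch.float16}  # hypothetical capability set


@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
def test_add(dtype):
    if dtype not in SUPPORTED_DTYPES:
        pytest.skip(f"This accelerator does not support {dtype}.")
    x = torch.ones(2, dtype=dtype)
    assert (x + x).dtype == dtype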
