diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
index 2c555203e950..8bba51dab6fd 100644
--- a/.github/workflows/cpu-inference.yml
+++ b/.github/workflows/cpu-inference.yml
@@ -76,4 +76,5 @@ jobs:
           source oneCCL/build/_install/env/setvars.sh
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' -m 'inference_ops' -m 'inference' unit/
+          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
index a252c615015d..4e5a34365f52 100644
--- a/.github/workflows/nv-transformers-v100.yml
+++ b/.github/workflows/nv-transformers-v100.yml
@@ -16,7 +16,7 @@ concurrency:
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu111, v100]
+    runs-on: [self-hosted, nvidia, cu116, v100]
     steps:
       - uses: actions/checkout@v3
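Note on the cpu-inference.yml hunk above: pytest does not combine repeated `-m` options; each occurrence overwrites the previous one, so the old single command effectively selected only the `inference` marker. That is why the step is split into separate invocations. A minimal illustration, not part of the patch, assuming pytest is installed and run from the `tests` directory:

```python
# Illustration only: repeated -m options do not accumulate in pytest;
# the last marker expression wins.
import pytest

# Mirrors the first new CI command: select only tests marked 'seq_inference'.
pytest.main(["-m", "seq_inference", "unit/"])

# A union of markers needs a single expression rather than repeated -m flags.
pytest.main(["-m", "inference_ops or inference", "unit/"])
```

By the same rule, the second new command (`-m 'inference_ops' -m 'inference'`) still resolves to just the `inference` marker; an `or` expression as sketched above would select both.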
diff --git a/blogs/deepspeed4science/chinese/README.md b/blogs/deepspeed4science/chinese/README.md
index 3ffddfb16fe5..07647c767553 100644
--- a/blogs/deepspeed4science/chinese/README.md
+++ b/blogs/deepspeed4science/chinese/README.md
@@ -29,7 +29,7 @@ DeepSpeed4Science的新系统技术可以用于很多推动科学边界的标志
 #### 科学基础模型(Scientific Foundation Model,SFM),微软研究院AI4Science

 <div align="center">
-<img src="../media/Figure2-1.jpg">
+<img src="../media/Figure2-1.png">

 *图2:科学基础模型(Scientific Foundation Model,SFM)及其当前探索:Distributional Graphormer。*

@@ -47,7 +47,7 @@ DeepSpeed4Science的新系统技术可以用于很多推动科学边界的标志
 我们的气候正在发生变化,导致极端天气事件的频率增加。为了减轻负面影响,预测这些事件将发生的地方变得越来越重要。[ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/)是第一个设计用于执行各种天气和气候建模任务的基础模型。它可以吸收许多具有不同变量和分辨率的数据集以提高天气预报的准确性。DeepSpeed4Science正在为ClimaX创建新的系统支持和加速策略,以高效地预训练/微调更大的基础模型,同时处理非常大的高分辨率图像数据(例如,数十到数百PB)和长序列。

-#### AI驱动的第一性原理分子动力学(AI Powered Ab Initio Molecular Dynamics,AI2MD),微软研究院AI4Science
+#### 分子动力学和机器学习力场(Molecular Dynamics and Machine Learning Force Field),微软研究院AI4Science

 <div align="center">
diff --git a/blogs/deepspeed4science/japanese/README.md b/blogs/deepspeed4science/japanese/README.md
index 80fc137e16bb..774ef79a17dc 100644
--- a/blogs/deepspeed4science/japanese/README.md
+++ b/blogs/deepspeed4science/japanese/README.md
@@ -29,7 +29,7 @@ DeepSpeed4Scienceによる新しいシステム技術はAI駆動の幅広い科
 #### 科学基盤モデル (Scientific Foundation Model, SFM), Microsoft Research AI4Science

 <div align="center">
-<img src="../media/Figure2-1.jpg">
+<img src="../media/Figure2-1.png">

 *図2: 科学基盤モデル (Scientific foundation model, SFM) とその探索: Distributional Graphormer*

@@ -47,7 +47,7 @@ DeepSpeed4Scienceによる新しいシステム技術はAI駆動の幅広い科
 気候の変化は、より頻繁な異常気象を引き起こしています。悪影響を軽減するため、これらのイベントが発生する場所を予測することがますます重要になっています。[ClimaX](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/articles/introducing-climax-the-first-foundation-model-for-weather-and-climate/)は、さまざまな気象および気候モデリングタスクを実行するために設計された最初の基盤モデルです。さまざまな変数と解像度を持つ多くの異なるデータセットを扱えるため、天気予報の精度が向上する可能性があります。DeepSpeed4Scienceは、非常に大きな高解像度画像データ(数十から数百ペタバイトなど)を長いシーケンスで処理しながら、より大きな基盤モデルを効率的に事前訓練/ファインチューニングするためのClimaXの新しいシステムサポートを提供しています。

-#### AIを用いたAb Initio分子動力学法(AI Powered Ab Initio Molecular Dynamics,AI2MD),Microsoft Research AI4Science
+#### 分子動力学と機械学習型力場(Molecular Dynamics and Machine Learning Force Field),Microsoft Research AI4Science

 <div align="center">
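Note on the `deepspeed/runtime/zero/linear.py` hunk below: the bias gradient must be summed over every dimension of `grad_output` except the last (feature) dimension; `sum(0)` is only correct for 2-D activations and produces a wrongly shaped gradient for 3-D inputs such as (batch, sequence, features). A standalone sketch, not part of the patch, that cross-checks the new rule against autograd:

```python
# Illustration only: the bias-gradient rule adopted by the linear.py hunk below.
import torch
import torch.nn.functional as F

x = torch.randn(4, 3, 5)            # (batch, seq, in_features)
w = torch.randn(8, 5)               # (out_features, in_features)
b = torch.zeros(8, requires_grad=True)
grad_output = torch.randn(4, 3, 8)  # same shape as F.linear(x, w, b)

dim = grad_output.dim()
if dim > 2:
    # Sum over all leading dims (0 .. dim-2), keeping only the feature dim.
    grad_bias = grad_output.sum([i for i in range(dim - 1)])
else:
    grad_bias = grad_output.sum(0)

F.linear(x, w, b).backward(grad_output)
assert torch.allclose(b.grad, grad_bias)  # autograd agrees with the manual rule
```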
diff --git a/blogs/deepspeed4science/media/Figure2-1.jpg b/blogs/deepspeed4science/media/Figure2-1.jpg
deleted file mode 100644
index 6008ccd91d09..000000000000
Binary files a/blogs/deepspeed4science/media/Figure2-1.jpg and /dev/null differ
diff --git a/blogs/deepspeed4science/media/Figure2-1.png b/blogs/deepspeed4science/media/Figure2-1.png
new file mode 100644
index 000000000000..bb0b8d9206d1
Binary files /dev/null and b/blogs/deepspeed4science/media/Figure2-1.png differ
diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py
index 175013361fb8..e9dd78864cde 100644
--- a/deepspeed/runtime/zero/linear.py
+++ b/deepspeed/runtime/zero/linear.py
@@ -94,7 +94,10 @@ def backward(ctx, grad_output):
             #print(f"Computed grad weight grad_weight {grad_weight.shape}")
         if bias is not None and ctx.needs_input_grad[2]:
             #print("Computing grad bias")
-            grad_bias = grad_output.sum(0)
+            if dim > 2:
+                grad_bias = grad_output.sum([i for i in range(dim - 1)])
+            else:
+                grad_bias = grad_output.sum(0)
             #print("Done computing grad bias")
             #print("needs bias")
             #print(f"backward shaped grad_input {grad_input.shape}, grad_weight {grad_weight.shape}, grad_bias {grad_bias.shape if grad_bias is not None else None}")
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
index 0a3cb030f9cf..de94d757c3b7 100755
--- a/deepspeed/runtime/zero/partition_parameters.py
+++ b/deepspeed/runtime/zero/partition_parameters.py
@@ -897,7 +897,7 @@ def __init__(
         self.quantized_weights = _ds_config.zero_config.zero_quantized_weights
         self.quantized_nontrainable_weights = zero_quantized_nontrainable_weights
         if _ds_config is not None and _ds_config.zero_config.zero_quantized_nontrainable_weights and not self.quantized_nontrainable_weights:
-            self.quantized_weights = _ds_config.zero_config.zero_quantized_nontrainable_weights
+            self.quantized_nontrainable_weights = _ds_config.zero_config.zero_quantized_nontrainable_weights

         self.module = module
         if (self.quantized_weights or self.quantized_nontrainable_weights):
@@ -1476,7 +1476,10 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
                 if start < param.ds_numel and end <= param.ds_numel:
                     src_tensor = one_dim_param.narrow(0, start, partition_size)

-                    param.ds_tensor.copy_(src_tensor)
+                    with torch.no_grad():
+                        # Make sure param.ds_tensor never requires grad,
+                        # otherwise the torch tracer will complain.
+                        param.ds_tensor.copy_(src_tensor)
                     #partitioned_tensor = src_tensor.clone().detach().to(self.remote_device)
@@ -1486,9 +1489,12 @@
                     #                                  device=self.remote_device )
                     if start < param.ds_numel:
-                        elements_to_copy = param.ds_numel - start
-                        param.ds_tensor.narrow(0, 0,
-                                               elements_to_copy).copy_(one_dim_param.narrow(0, start, elements_to_copy))
+                        elems_to_copy = param.ds_numel - start
+                        with torch.no_grad():
+                            # Make sure param.ds_tensor never requires grad,
+                            # otherwise the torch tracer will complain.
+                            param.ds_tensor.narrow(0, 0,
+                                                   elems_to_copy).copy_(one_dim_param.narrow(0, start, elems_to_copy))

                     #print(f"Remote device {self.remote_device}")
diff --git a/tests/unit/hybrid_engine/test_he_all.py b/tests/unit/hybrid_engine/test_he_all.py
index 86eabb1add0c..aa1f120645b1 100644
--- a/tests/unit/hybrid_engine/test_he_all.py
+++ b/tests/unit/hybrid_engine/test_he_all.py
@@ -12,6 +12,10 @@
 from deepspeed.accelerator import get_accelerator
 from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

 rocm_version = OpBuilder.installed_rocm_version()
 if rocm_version != (0, 0):
diff --git a/tests/unit/hybrid_engine/test_he_llama.py b/tests/unit/hybrid_engine/test_he_llama.py
index 5f992f69b402..fcf5b8ffb89b 100644
--- a/tests/unit/hybrid_engine/test_he_llama.py
+++ b/tests/unit/hybrid_engine/test_he_llama.py
@@ -12,6 +12,10 @@
 from deepspeed.accelerator import get_accelerator
 from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

 rocm_version = OpBuilder.installed_rocm_version()
 if rocm_version != (0, 0):
diff --git a/tests/unit/hybrid_engine/test_he_lora.py b/tests/unit/hybrid_engine/test_he_lora.py
index f61fdeb3a9f9..ea27239ed55e 100644
--- a/tests/unit/hybrid_engine/test_he_lora.py
+++ b/tests/unit/hybrid_engine/test_he_lora.py
@@ -14,6 +14,10 @@
 from deepspeed.utils import safe_get_full_grad
 import numpy.testing as npt
 from unit.common import DistributedTest
+from deepspeed.ops.op_builder import InferenceBuilder
+
+if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+    pytest.skip("This op has not been implemented on this system.", allow_module_level=True)

 from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM)
diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index 4ee3cd73c045..894f040be207 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -22,9 +22,6 @@
 from deepspeed.accelerator import get_accelerator
 from deepspeed.ops.op_builder import InferenceBuilder

-if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
-    pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
-
 rocm_version = OpBuilder.installed_rocm_version()
 if rocm_version != (0, 0):
     pytest.skip("skip inference tests on rocm for now", allow_module_level=True)
@@ -365,6 +362,9 @@ def test(
         if invalid_test_msg:
             pytest.skip(invalid_test_msg)

+        if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
+            pytest.skip("This op has not been implemented on this system.", allow_module_level=True)
+
         model, task = model_w_task
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
@@ -401,6 +401,9 @@ def test(
     ):
         model, task = model_w_task
         dtype = torch.float16
+        if dtype not in get_accelerator().supported_dtypes():
+            pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.")
+
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
         pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
@@ -514,7 +517,7 @@ def test(
                          [("Helsinki-NLP/opus-mt-en-de",
                            "translation"), ("Salesforce/codegen-350M-mono", "text-generation")],
                          ids=["marian", "codegen"], #codegen has fusedqkv weight.
                          )
-@pytest.mark.parametrize("dtype", [torch.float16], ids=["fp16"])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
 class TestAutoTensorParallelism(DistributedTest):
     world_size = [2]
@@ -530,6 +533,13 @@ def test(
         if invalid_test_msg:
             pytest.skip(invalid_test_msg)

+        if dtype not in get_accelerator().supported_dtypes():
+            pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.")
+
+        # TODO: enable this test after the torch 2.1 stable release
+        if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono":
+            pytest.skip("The codegen model in bf16 needs torch > 2.0.")
+
         model, task = model_w_task
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
         world_size = int(os.getenv("WORLD_SIZE", "2"))