Commit a726d13: Merge branch 'microsoft:master' into master

xxr3376 authored Jun 9, 2023
2 parents: bc70db4 + 046afce
Showing 47 changed files with 634 additions and 167 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/cpu-inference.yml
@@ -6,9 +6,11 @@ on:
      - 'staging**'
    paths-ignore:
      - 'docs/**'
+      - 'blogs/**'
  pull_request:
    paths-ignore:
      - 'docs/**'
+      - 'blogs/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
6 changes: 6 additions & 0 deletions .github/workflows/nv-pre-compile-ops.yml
@@ -4,9 +4,15 @@ on:
  push:
    branches:
      - 'staging**'
+    paths-ignore:
+      - 'docs/**'
+      - 'blogs/**'
  pull_request:
    branches:
      '**'
+    paths-ignore:
+      - 'docs/**'
+      - 'blogs/**'
  schedule:
    - cron: "0 0 * * *"

6 changes: 6 additions & 0 deletions .github/workflows/python.yml
@@ -4,9 +4,15 @@ on:
  push:
    branches:
      - 'staging**'
+    paths-ignore:
+      - 'docs/**'
+      - 'blogs/**'
  pull_request:
    branches:
      '**'
+    paths-ignore:
+      - 'docs/**'
+      - 'blogs/**'
  schedule:
    - cron: "0 0 * * *"

1 change: 1 addition & 0 deletions README.md
@@ -223,6 +223,7 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information
18. Syed Zawad, Cheng Li, Zhewei Yao, Elton Zheng, Yuxiong He, Feng Yan. (2023) DySR: Adaptive Super-Resolution via Algorithm and System Co-design. [ICLR:2023](https://openreview.net/forum?id=Pgtn4l6eKjv).
19. Sheng Shen, Zhewei Yao, Chunyuan Li, Trevor Darrell, Kurt Keutzer, Yuxiong He. (2023) Scaling Vision-Language Models with Sparse Mixture of Experts. [arXiv:2303.07226](https://arxiv.org/abs/2303.07226).
20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023.
+21. Siddharth Singh, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He, Abhinav Bhatele. (2023) A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training [arXiv:2303.06318](https://arxiv.org/abs/2303.06318) and will appear at ICS 2023.


# Videos
241 changes: 241 additions & 0 deletions accelerator/npu_accelerator.py
@@ -0,0 +1,241 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .abstract_accelerator import DeepSpeedAccelerator
# During the setup stage torch may not be installed; silently passing on a
# missing torch allows op-builder-related APIs to still be executed.
try:
    import torch.npu
except ImportError:
    pass


class NPU_Accelerator(DeepSpeedAccelerator):
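    """DeepSpeed accelerator wrapper for NPU devices.

    Maps the DeepSpeedAccelerator interface onto ``torch.npu`` and uses
    HCCL as the collective-communication backend.
    """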

    def __init__(self):
        self._name = 'npu'
        self._communication_backend_name = 'hccl'

    def is_synchronized_device(self):
        return False

    # Device APIs
    def device_name(self, device_index=None):
        if device_index is None:
            return 'npu'
        return 'npu:{}'.format(device_index)

    def device(self, device_index=None):
        return torch.npu.device(device_index)

    def set_device(self, device_index):
        torch.npu.set_device(device_index)

    def current_device(self):
        return torch.npu.current_device()

    def current_device_name(self):
        return 'npu:{}'.format(torch.npu.current_device())

    def device_count(self):
        return torch.npu.device_count()

    def synchronize(self, device_index=None):
        return torch.npu.synchronize(device_index)

    # RNG APIs
    def random(self):
        return torch.random

    def set_rng_state(self, new_state, device_index=None):
        if device_index is None:
            return torch.npu.set_rng_state(new_state)

        return torch.npu.set_rng_state(new_state, device_index)

    def get_rng_state(self, device_index=None):
        if device_index is None:
            return torch.npu.get_rng_state()

        return torch.npu.get_rng_state(device_index)

    def manual_seed(self, seed):
        return torch.npu.manual_seed(seed)

    def manual_seed_all(self, seed):
        return torch.npu.manual_seed_all(seed)

    def initial_seed(self, seed):
        return torch.npu.initial_seed(seed)

    def default_generator(self, device_index):
        return torch.npu.default_generators[device_index]

    # Streams/Events
    @property
    def Stream(self):
        return torch.npu.Stream

    def stream(self, stream):
        return torch.npu.stream(stream)

    def current_stream(self, device_index=None):
        return torch.npu.current_stream(device_index)

    def default_stream(self, device_index=None):
        return torch.npu.default_stream(device_index)

    @property
    def Event(self):
        return torch.npu.Event

    # Memory management
    def empty_cache(self):
        return torch.npu.empty_cache()

    def memory_allocated(self, device_index=None):
        return torch.npu.memory_allocated(device_index)

    def max_memory_allocated(self, device_index=None):
        return torch.npu.max_memory_allocated(device_index)

    def reset_max_memory_allocated(self, device_index=None):
        return torch.npu.reset_max_memory_allocated(device_index)

    def memory_cached(self, device_index=None):
        return torch.npu.memory_cached(device_index)

    def max_memory_cached(self, device_index=None):
        return torch.npu.max_memory_cached(device_index)

    def reset_max_memory_cached(self, device_index=None):
        return torch.npu.reset_max_memory_cached(device_index)

    def memory_stats(self, device_index=None):
        if hasattr(torch.npu, 'memory_stats'):
            return torch.npu.memory_stats(device_index)

    def reset_peak_memory_stats(self, device_index=None):
        if hasattr(torch.npu, 'reset_peak_memory_stats'):
            return torch.npu.reset_peak_memory_stats(device_index)

    def memory_reserved(self, device_index=None):
        if hasattr(torch.npu, 'memory_reserved'):
            return torch.npu.memory_reserved(device_index)

    def max_memory_reserved(self, device_index=None):
        if hasattr(torch.npu, 'max_memory_reserved'):
            return torch.npu.max_memory_reserved(device_index)

    def total_memory(self, device_index=None):
        return torch.npu.get_device_properties(device_index).total_memory

    # Data types
    def is_bf16_supported(self):
        return torch.npu.is_bf16_supported()

    def is_fp16_supported(self):
        return True

    # Misc
    def amp(self):
        if hasattr(torch.npu, 'amp'):
            return torch.npu.amp
        return None

    def is_available(self):
        return torch.npu.is_available()

    def range_push(self, msg):
        return

    def range_pop(self):
        return

    def lazy_call(self, callback):
        return torch.npu._lazy_call(callback)

    def communication_backend_name(self):
        return self._communication_backend_name

    # Tensor operations

    @property
    def BFloat16Tensor(self):
        return torch.npu.BFloat16Tensor

    @property
    def ByteTensor(self):
        return torch.npu.ByteTensor

    @property
    def DoubleTensor(self):
        return torch.npu.DoubleTensor

    @property
    def FloatTensor(self):
        return torch.npu.FloatTensor

    @property
    def HalfTensor(self):
        return torch.npu.HalfTensor

    @property
    def IntTensor(self):
        return torch.npu.IntTensor

    @property
    def LongTensor(self):
        return torch.npu.LongTensor

    def pin_memory(self, tensor):
        return tensor.pin_memory()

    def on_accelerator(self, tensor):
        device_str = str(tensor.device)
        return device_str.startswith('npu:')

    def op_builder_dir(self):
        try:
            # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
            # if successful this also means we're doing a local install and not the JIT compile path
            from op_builder import __deepspeed__  # noqa: F401
            return "op_builder.npu"
        except ImportError:
            return "deepspeed.ops.op_builder.npu"

    # dict that holds class name <--> class type mapping, i.e.
    # 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
    # this dict is populated lazily on first use
    class_dict = None

    def _lazy_init_class_dict(self):
        if self.class_dict is not None:
            return
        else:
            self.class_dict = {}

    # create an instance of an op builder, specified by class_name
    def create_op_builder(self, class_name):
        self._lazy_init_class_dict()
        if class_name in self.class_dict:
            return self.class_dict[class_name]()
        else:
            return None

    # return an op builder class, specified by class_name
    def get_op_builder(self, class_name):
        self._lazy_init_class_dict()
        if class_name in self.class_dict:
            return self.class_dict[class_name]
        else:
            return None

    def build_extension(self):
        from torch.utils.cpp_extension import BuildExtension
        return BuildExtension
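For orientation, here is a minimal usage sketch of the interface this file implements. It is hedged: `get_accelerator()` resolves to `NPU_Accelerator` only when `torch.npu` is importable, and on other machines a different backend would be selected, so the printed values are illustrative assumptions.

```python
# Hedged sketch: exercising the accelerator interface defined above.
# Assumes a DeepSpeed install on an Ascend host where torch.npu imports;
# elsewhere get_accelerator() returns a different backend.
from deepspeed.accelerator import get_accelerator

accel = get_accelerator()
print(accel.device_name(0))                # e.g. 'npu:0' on an NPU build
print(accel.communication_backend_name())  # 'hccl' for the NPU backend
x = accel.FloatTensor(4).fill_(1.0)        # backend-specific tensor type
accel.synchronize()                        # wait for outstanding device work
```

Routing all device-specific calls through this one interface is what lets the rest of DeepSpeed stay backend-agnostic.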
39 changes: 24 additions & 15 deletions accelerator/real_accelerator.py
@@ -4,6 +4,13 @@
# DeepSpeed Team
import os

+try:
+    # Importing logger currently requires that torch is installed, hence the try...except.
+    # TODO: Remove logger dependency on torch.
+    from deepspeed.utils import logger as accel_logger
+except ImportError as e:
+    accel_logger = None
+
try:
    from accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa1
except ImportError as e:
@@ -72,12 +79,12 @@ def get_accelerator():
            accelerator_name = 'xpu'
        except ImportError as e:
            # We need a way to choose between CUDA_Accelerator and CPU_Accelerator
-            # Currently we detect whether intel_etension_for_pytorch is installed
-            # in the environment and use CPU_Accelerator if the answewr is True.
+            # Currently we detect whether intel_extension_for_pytorch is installed
+            # in the environment and use CPU_Accelerator if the answer is True.
            # An alternative might be detect whether CUDA device is installed on
            # the system but this comes with two pitfalls:
            # 1. the system may not have torch pre-installed, so
-            #    get_accelerator().is_avaiable() may not work.
+            #    get_accelerator().is_available() may not work.
            # 2. Some scenario like install on login node (without CUDA device)
            #    and run on compute node (with CUDA device) may cause mismatch
            #    between installation time and runtime.
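The comment block above motivates the detection heuristic; as a hedged illustration (a hypothetical helper, not DeepSpeed's actual control flow), the try-import probe it describes looks roughly like this:

```python
# Hedged sketch of the detection heuristic described in the comments above.
# detect_accelerator_name is a hypothetical name; the real logic lives
# inside get_accelerator() and handles more backends and edge cases.
def detect_accelerator_name():
    try:
        import intel_extension_for_pytorch  # noqa: F401
        # IPEX present: prefer an Intel backend ('xpu' or 'cpu').
        return 'xpu'
    except ImportError:
        # No IPEX: fall back to the default CUDA accelerator.
        return 'cuda'
```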
@@ -99,25 +106,27 @@ def get_accelerator():
        # XPU_Accelerator is already imported in detection stage
        ds_accelerator = XPU_Accelerator()
    _validate_accelerator(ds_accelerator)
-    print(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})")
+    if accel_logger is not None:
+        accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})")
    return ds_accelerator


def set_accelerator(accel_obj):
    global ds_accelerator
    _validate_accelerator(accel_obj)
-    print(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
+    if accel_logger is not None:
+        accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
    ds_accelerator = accel_obj


'''
-----------[code] test_get.py -----------
from deepspeed.accelerator import get_accelerator
my_accelerator = get_accelerator()
-print(f'{my_accelerator._name=}')
-print(f'{my_accelerator._communication_backend=}')
-print(f'{my_accelerator.HalfTensor().device=}')
-print(f'{my_accelerator.total_memory()=}')
+logger.info(f'{my_accelerator._name=}')
+logger.info(f'{my_accelerator._communication_backend=}')
+logger.info(f'{my_accelerator.HalfTensor().device=}')
+logger.info(f'{my_accelerator.total_memory()=}')
-----------[code] test_get.py -----------
---[output] python test_get.py---------
@@ -131,16 +140,16 @@ def set_accelerator(accel_obj):
-----------[code] test_set.py -----------
from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator
cu_accel = CUDA_Accelerator()
-print(f'{id(cu_accel)=}')
+logger.info(f'{id(cu_accel)=}')
from deepspeed.accelerator import set_accelerator, get_accelerator
set_accelerator(cu_accel)
my_accelerator = get_accelerator()
-print(f'{id(my_accelerator)=}')
-print(f'{my_accelerator._name=}')
-print(f'{my_accelerator._communication_backend=}')
-print(f'{my_accelerator.HalfTensor().device=}')
-print(f'{my_accelerator.total_memory()=}')
+logger.info(f'{id(my_accelerator)=}')
+logger.info(f'{my_accelerator._name=}')
+logger.info(f'{my_accelerator._communication_backend=}')
+logger.info(f'{my_accelerator.HalfTensor().device=}')
+logger.info(f'{my_accelerator.total_memory()=}')
-----------[code] test_set.py -----------
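Note that the docstring examples above use `logger` without importing it, and reference `_communication_backend` where the accelerator classes actually define `_communication_backend_name` (see npu_accelerator.py). A self-contained variant, offered here as an assumption-flagged sketch rather than part of the commit, might read:

```python
# Hedged, runnable variant of the test_get.py docstring example.
# Assumes deepspeed.utils exposes `logger` and that the accelerator
# attribute is `_communication_backend_name`, as in npu_accelerator.py.
from deepspeed.utils import logger
from deepspeed.accelerator import get_accelerator

my_accelerator = get_accelerator()
logger.info(f'{my_accelerator._name=}')
logger.info(f'{my_accelerator._communication_backend_name=}')
logger.info(f'{my_accelerator.HalfTensor().device=}')
logger.info(f'{my_accelerator.total_memory()=}')
```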
2 changes: 1 addition & 1 deletion blogs/deepspeed-chat/chinese/README.md
@@ -107,7 +107,7 @@ python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m
```
Human: Do you know Microsoft?
Assistant: Yes, I’m a big fan of Microsoft. What about them?
-Human: Can you explian it to a 6-year old child? I wonder how I should describe it
+Human: Can you explain it to a 6-year old child? I wonder how I should describe it
Assistant: Sure, I can try. Microsoft is a company that makes computers, and they make a program called “Windows” which
is the operating system that runs on the computer. It’s like the “brain” of the computer. It’s where all the
programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to
2 changes: 1 addition & 1 deletion deepspeed/autotuning/autotuner.py
@@ -637,7 +637,7 @@ def tune_space(self, tuning_space, prev_max_mbs=0, prev_best_mbs=0, prev_best_me
logger.info(f"End tuning for space: {tuning_space_name}")
return max_micro_batch_size, best_mbs, best_metric_val

def get_plauteu_mbs(self, tuning_space_name):
def get_plateau_mbs(self, tuning_space_name):
if tuning_space_name not in self.records:
return 0
space_records = self.records[tuning_space_name]
(The remaining changed files in this commit were not loaded on this page.)