From 59d62d614ed4390b82bdb36811fcc55c710510fe Mon Sep 17 00:00:00 2001 From: Max Kovalenko <75629718+deepcharm@users.noreply.github.com> Date: Mon, 9 Oct 2023 04:32:25 +0300 Subject: [PATCH 01/11] Move tensors to device if mp is not enabled (#4461) Tensors are not moved to device in gather_partitioned_activations in case model parallel is not enabled. Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/activation_checkpointing/checkpointing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 77407a52026a..108cb37b57fb 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -270,6 +270,8 @@ def gather_partitioned_activations(tensors, device=None): # don't need to do all_gather if model parallel is not enabled if mp_group is None or mp_size == 1: item = item.view(list(size.numpy())) + if device is not None: + item = item.to(device) inputs.append(item) continue From 6b634d0e7e7a7059f9fdf8434b7d92d3da2953ff Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Sun, 8 Oct 2023 18:33:17 -0700 Subject: [PATCH 02/11] move torch import (#4468) --- op_builder/async_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/op_builder/async_io.py b/op_builder/async_io.py index da511a0a8c9d..b55c821910b9 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -5,7 +5,6 @@ import distutils.spawn import subprocess -import torch from .builder import OpBuilder @@ -36,6 +35,7 @@ def cxx_args(self): # -O0 for improved debugging, since performance is bound by I/O CPU_ARCH = self.cpu_arch() SIMD_WIDTH = self.simd_width() + import torch # Keep this import here to avoid errors when building DeepSpeed wheel without torch installed TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: CPP_STD = '-std=c++17' From 6763e2de610b006c0fafc2c19a31f6508c920d1b Mon Sep 17 00:00:00 2001 From: Yejing-Lai <55339926+Yejing-Lai@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:24:16 +0800 Subject: [PATCH 03/11] add lm_head and embed_out tensor parallel (#3962) * add lm_head and embed_out tensor parallel * fix load lm_head.weight name issue * replace all_reduce with inference_all_reduce * refactor lm_head tensor parallel --------- Co-authored-by: Chen, Zhenhuan --- deepspeed/module_inject/auto_tp.py | 20 ++++++++++++++- deepspeed/module_inject/layers.py | 30 +++++++++++++++++++++++ deepspeed/module_inject/replace_module.py | 11 ++++++++- 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index daf143919558..2e348de63454 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -11,7 +11,7 @@ from typing import Optional import torch from deepspeed import comm as dist -from .layers import LinearAllreduce, LinearLayer +from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce from deepspeed.accelerator import get_accelerator from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw @@ -318,6 +318,11 @@ def _replace(self, child, name, conv_linear_layer): del data setattr(child, "replaced", True) + if name == "lm_head" or name == 'embed_out': + return LmHeadLinearAllreduce( + torch.nn.parameter.Parameter(data_dc, requires_grad=False), dist.get_rank(), dist.get_world_size(), + child.bias if 
child.bias is None else torch.nn.parameter.Parameter( + child.bias.to(get_accelerator().current_device_name())), self.mp_group) return LinearAllreduce(torch.nn.parameter.Parameter(data_dc, requires_grad=False), child.bias if child.bias is None else \ torch.nn.parameter.Parameter(child.bias.to(get_accelerator().current_device_name())), self.mp_group) else: @@ -436,3 +441,16 @@ def _replace_module(self, r_module, prev_name='', prev_class_name=''): self.update_mp_params(child) self._replace_module(child, name, class_name) return r_module + + def _replace_last_linear_module(self, r_module): + if hasattr(r_module, "lm_head"): + name = "lm_head" + child = r_module.lm_head + elif hasattr(r_module, "embed_out"): + name = "embed_out" + child = r_module.embed_out + else: + return r_module + if child.__class__ in self.linear_policies: + setattr(r_module, name, self.linear_policies[child.__class__](child, name, self.conv_linear_layer)) + return r_module diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index aa29651ec4cf..7a565560dec9 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -29,6 +29,36 @@ def forward(self, input): return output +class LmHeadLinearAllreduce(nn.Module): + + def __init__( + self, + weight, + rank, + world_size, + bias=None, + mp_group=None, + ): + super(LmHeadLinearAllreduce, self).__init__() + self.weight = weight + self.bias = bias + self.mp_group = mp_group + self.rank = rank + self.world_size = world_size + + def forward(self, input): + assert input.shape[ + -1] % self.world_size == 0, 'Please ensure that self.world_size is divisible by input.shape[-1]' + input_shard = input.shape[-1] // self.world_size + output = torch.matmul(input[:, :, self.rank * input_shard:(self.rank + 1) * input_shard], + self.weight.transpose(-1, -2)) + if self.mp_group is not None: + dist.inference_all_reduce(output, group=self.mp_group) + if self.bias is not None: + output += self.bias + return output + + class LinearLayer(nn.Module): def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None): diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index f9dd921b9ae1..8666372fa3f4 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -275,6 +275,8 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): _autotp.update_linear_policies() # 4. 
Replace modules + if "lm_head" in all_reduce_linears or "embed_out" in all_reduce_linears: + return _autotp._replace_last_linear_module(module) return _autotp._replace_module(module) def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): @@ -304,6 +306,13 @@ def set_lm_head(module): if embedding_weight is not None and hasattr(module, "lm_head") and hasattr( module.lm_head, "weight") and module.lm_head.weight.is_meta: module.lm_head.weight = embedding_weight + # enable tensor parallel for the last linear + if hasattr(module, "lm_head") and hasattr(module.lm_head, "weight") and not module.lm_head.weight.is_meta: + module = replace_wo_policy(module, ("lm_head", ), 0, "lm_head") + elif hasattr(module, "embed_out") and hasattr(module.embed_out, + "weight") and not module.embed_out.weight.is_meta: + module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out") + return module if checkpoint_dict is not None and not config.replace_with_kernel_inject: # AutoTP shard loading @@ -318,7 +327,7 @@ def set_lm_head(module): checkpoint=checkpoint_file) pbar.update(1) gc.collect() - set_lm_head(replaced_module) + replaced_module = set_lm_head(replaced_module) else: replaced_module = replace_module(model=model, orig_class=orig_layer_impl, From 923f3590ee908d39cfcd3c59d3fc0ae27bbd7e8c Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 9 Oct 2023 09:44:39 -0700 Subject: [PATCH 04/11] fix bad build command (#4483) --- .github/workflows/release.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 093bc98d41e8..fe6122d04890 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,6 +14,8 @@ jobs: - uses: actions/checkout@v3 with: ref: "master" + - id: setup-venv + uses: ./.github/workflows/setup-venv - name: Get release version from tag run: | echo "RELEASE_VERSION=${GITHUB_REF#refs/*/v}" >> $GITHUB_ENV @@ -22,7 +24,7 @@ jobs: python release/check_release_version.py --release_version ${{ env.RELEASE_VERSION }} - name: Build DeepSpeed run: | - DS_BUILD_STRING=" " python setup.py sdist_wheel + DS_BUILD_STRING=" " python setup.py sdist - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: From e9503fe40ee127e74c8b0ea962aac4f0ae957624 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 9 Oct 2023 09:53:41 -0700 Subject: [PATCH 05/11] fix missing package --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fe6122d04890..8e016b4169cb 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -21,6 +21,7 @@ jobs: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/v}" >> $GITHUB_ENV - name: Check release version run: | + pip install packaging python release/check_release_version.py --release_version ${{ env.RELEASE_VERSION }} - name: Build DeepSpeed run: | From 77a28371630ebb12f9fa058d6adda84e283d6175 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 9 Oct 2023 09:57:15 -0700 Subject: [PATCH 06/11] [create-pull-request] automated change (#4484) Co-authored-by: mrwyattii --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index af88ba824866..bc859cbd6d99 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.11.1 +0.11.2 From e7acee4933a596cf17fed50376823a5625d85f31 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 9 Oct 2023 
10:08:29 -0700 Subject: [PATCH 07/11] Remove DS_BUILD_UTILS reference (#4485) --- docs/_tutorials/advanced-install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index c2b4c04cad1c..10197e62f681 100755 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -61,6 +61,7 @@ Available `DS_BUILD` options include: * `DS_BUILD_CCL_COMM` builds the communication collective libs * `DS_BUILD_CPU_ADAM` builds the CPUAdam op * `DS_BUILD_CPU_LION` builds the CPULion op +* `DS_BUILD_EVOFORMER_ATTN` builds the EvoformerAttn op (from [Alphafold](https://www.deepspeed.ai/tutorials/ds4sci_evoformerattention/)) * `DS_BUILD_FUSED_ADAM` builds the FusedAdam op (from [apex](https://github.com/NVIDIA/apex)) * `DS_BUILD_FUSED_LION` builds the FusedLion op * `DS_BUILD_CPU_ADAGRAD` builds the CPUAdagrad op @@ -71,7 +72,6 @@ Available `DS_BUILD` options include: * `DS_BUILD_TRANSFORMER` builds the transformer op * `DS_BUILD_TRANSFORMER_INFERENCE` builds the transformer-inference op * `DS_BUILD_STOCHASTIC_TRANSFORMER` builds the stochastic transformer op -* `DS_BUILD_UTILS` builds various optimized utilities To speed up the build-all process, you can parallelize the compilation process with: From 604d701e35548e5407b017c088bdc3760832c9e0 Mon Sep 17 00:00:00 2001 From: Matthew Hoffman Date: Mon, 9 Oct 2023 10:59:30 -0800 Subject: [PATCH 08/11] Introduce pydantic_v1 compatibility module for pydantic>=2.0.0 support (#4407) * Introduce pydantic_v1 compatibility module for pydantic>=2.0.0 support --- deepspeed/comm/config.py | 2 +- deepspeed/inference/config.py | 3 +-- deepspeed/monitor/config.py | 2 +- deepspeed/pydantic_v1.py | 16 ++++++++++++++++ deepspeed/runtime/config_utils.py | 2 +- deepspeed/runtime/zero/config.py | 2 +- deepspeed/runtime/zero/offload_config.py | 2 +- environment.yml | 2 +- requirements/requirements-readthedocs.txt | 4 ++-- requirements/requirements.txt | 2 +- tests/unit/runtime/test_ds_config_model.py | 2 +- 11 files changed, 27 insertions(+), 12 deletions(-) create mode 100644 deepspeed/pydantic_v1.py diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py index 138badebe5a9..1c441bb6bfe9 100644 --- a/deepspeed/comm/config.py +++ b/deepspeed/comm/config.py @@ -3,8 +3,8 @@ # DeepSpeed Team -from pydantic import BaseModel from .constants import * +from ..pydantic_v1 import BaseModel class CommsConfig(BaseModel): diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index 70f1c0dbd5b7..1d5018aaa75b 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -5,10 +5,9 @@ import torch import deepspeed +from deepspeed.pydantic_v1 import Field, validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel from deepspeed.runtime.zero.config import DeepSpeedZeroConfig -from pydantic import Field -from pydantic import validator from typing import Dict, Union from enum import Enum diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py index 2706764290fd..5a8ca6ecf5cd 100644 --- a/deepspeed/monitor/config.py +++ b/deepspeed/monitor/config.py @@ -3,7 +3,7 @@ # DeepSpeed Team -from pydantic import root_validator +from deepspeed.pydantic_v1 import root_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel diff --git a/deepspeed/pydantic_v1.py b/deepspeed/pydantic_v1.py new file mode 100644 index 000000000000..6aba072ad929 --- /dev/null +++ b/deepspeed/pydantic_v1.py @@ -0,0 +1,16 @@ +# 
Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +"""Pydantic v1 compatibility module. + +Pydantic v2 introduced breaking changes that hinder its adoption: +https://docs.pydantic.dev/latest/migration/. To provide deepspeed users the option to +migrate to pydantic v2 on their own timeline, deepspeed uses this compatibility module +as a pydantic-version-agnostic alias for pydantic's v1 API. +""" + +try: + from pydantic.v1 import * # noqa: F401 +except ImportError: + from pydantic import * # noqa: F401 diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 0fb1372deac8..5522a8e79d69 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -9,7 +9,7 @@ import collections import collections.abc from functools import reduce -from pydantic import BaseModel +from deepspeed.pydantic_v1 import BaseModel from deepspeed.utils import logger diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 1fc11f0e46f5..35d60b5b3290 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -3,10 +3,10 @@ # DeepSpeed Team -from pydantic import Field, validator import sys from typing import Optional from enum import Enum +from deepspeed.pydantic_v1 import Field, validator from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel from deepspeed.utils import logger from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index c3a6dc7af530..1bd79412d39f 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -3,9 +3,9 @@ # DeepSpeed Team -from pydantic import Field, validator from enum import Enum from pathlib import Path +from deepspeed.pydantic_v1 import Field, validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int diff --git a/environment.yml b/environment.yml index e55fe96e5a5a..28c298717d80 100644 --- a/environment.yml +++ b/environment.yml @@ -18,4 +18,4 @@ dependencies: - certifi - openssl - python=3.10 - - pydantic<2.0.0 + - pydantic diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt index a6d7915e0ea5..fcd0ec5a9a6a 100644 --- a/requirements/requirements-readthedocs.txt +++ b/requirements/requirements-readthedocs.txt @@ -1,9 +1,9 @@ -autodoc_pydantic<2.0.0 +autodoc_pydantic docutils<0.18 hjson packaging psutil py-cpuinfo -pydantic<2.0.0 +pydantic torch tqdm diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 8c5e76750573..6840d6dbcc98 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -4,6 +4,6 @@ numpy packaging>=20.0 psutil py-cpuinfo -pydantic<2.0.0 +pydantic torch tqdm diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py index b9c67c9a30dd..87ea747cf423 100644 --- a/tests/unit/runtime/test_ds_config_model.py +++ b/tests/unit/runtime/test_ds_config_model.py @@ -6,8 +6,8 @@ import pytest import os import json -from pydantic import Field, ValidationError from typing import List +from deepspeed.pydantic_v1 import Field, ValidationError from deepspeed.runtime import config as ds_config from deepspeed.runtime.config_utils import DeepSpeedConfigModel From 706a72562ab6a4276fb0e6a7348a04f8e11bdf79 Mon Sep 17 00:00:00 2001 From: 
Bram Vanroy <2779410+BramVanroy@users.noreply.github.com> Date: Tue, 10 Oct 2023 17:56:10 +0200 Subject: [PATCH 09/11] Allow env var for timeout (#4405) Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Michael Wyatt --- deepspeed/constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepspeed/constants.py b/deepspeed/constants.py index 7ebc8f9983a5..30135f41b7b6 100644 --- a/deepspeed/constants.py +++ b/deepspeed/constants.py @@ -3,6 +3,7 @@ # DeepSpeed Team +import os from datetime import timedelta ############################################# @@ -15,6 +16,6 @@ # (only if NCCL_BLOCKING_WAIT or NCCL_ASYNC_ERROR_HANDLING is set to 1). # To make an attempt at backwards compatibility with THD, we use an # extraordinarily high default timeout, given that THD did not have timeouts. -default_pg_timeout = timedelta(minutes=30) +default_pg_timeout = timedelta(minutes=int(os.getenv("DEEPSPEED_TIMEOUT", default=30))) INFERENCE_GENERIC_MODE = 'generic' INFERENCE_SPECIALIZED_MODE = 'specialized' From 427253b94bf425395d48f494270550bac9ac880a Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:41:28 -0700 Subject: [PATCH 10/11] Update ROCm verison (#4486) * Update ROCm verison * Update .github/workflows/amd-mi200.yml --------- Co-authored-by: Michael Wyatt --- .github/workflows/amd-mi200.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index a275225cc5e4..77f33f744ea8 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -28,7 +28,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm5.6 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From 6c86ff393fbb579a5ca40f104cd70a14d3cb1eda Mon Sep 17 00:00:00 2001 From: stephen youn <13525892+stephen-youn@users.noreply.github.com> Date: Tue, 10 Oct 2023 20:01:48 -0700 Subject: [PATCH 11/11] adding 8bit dequantization kernel for asym fine-grained block quantization in zero-inference (#4450) * kernels added for asym fine-grained block quantization with 8bits * formatting * clean up the code * rename quantize_int4.cu to quantize_intX.cu * rename test_int4_quantization.py to test_intX_quantization.py * "rename test_int4_quantization.py to test_intX_quantization.py" This reverts commit 2d341405b2ed6cf69e83fcabad1804513ea92122. 
* rename * fix after the pr comments * increased coverage of QuantLinear test (w/ and w/o the cuda kernels) * formatting --------- Co-authored-by: Stephen Youn Co-authored-by: Olatunji Ruwase --- csrc/includes/quantization.h | 8 +++ csrc/quantization/pt_binding.cpp | 23 ++++++++ .../{quantize_int4.cu => quantize_intX.cu} | 51 +++++++++++++++++ deepspeed/inference/quantization/utils.py | 18 +++--- op_builder/quantizer.py | 2 +- ...ntization.py => test_intX_quantization.py} | 56 +++++++++++-------- 6 files changed, 126 insertions(+), 32 deletions(-) rename csrc/quantization/{quantize_int4.cu => quantize_intX.cu} (76%) rename tests/unit/inference/quantization/{test_int4_quantization.py => test_intX_quantization.py} (91%) diff --git a/csrc/includes/quantization.h b/csrc/includes/quantization.h index d2873abf1839..45828832d8d2 100644 --- a/csrc/includes/quantization.h +++ b/csrc/includes/quantization.h @@ -98,3 +98,11 @@ void launch_dequantize_int4_to_half_experimental(uint8_t* data_in, int num_group, int group_size, cudaStream_t stream); + +void launch_dequantize_int8_to_half_experimental(uint8_t* data_in, + half* data_out, + half* scale_buffer, + half* min_val_buffer, + int num_group, + int group_size, + cudaStream_t stream); diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp index d4c253ee005d..a4210897092d 100644 --- a/csrc/quantization/pt_binding.cpp +++ b/csrc/quantization/pt_binding.cpp @@ -156,6 +156,26 @@ at::Tensor dequantize_int4_to_half_experimental(at::Tensor& data_in, return output; } +at::Tensor dequantize_int8_to_half_experimental(at::Tensor& data_in, + at::Tensor& scale_buffer, + at::Tensor& min_val_buffer, + int num_group, + int group_size) +{ + auto output_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA); + auto output = torch::empty({num_group, group_size}, output_options); + + launch_dequantize_int8_to_half_experimental((uint8_t*)data_in.data_ptr(), + (half*)output.data_ptr(), + (half*)scale_buffer.data_ptr(), + (half*)min_val_buffer.data_ptr(), + num_group, + group_size, + at::cuda::getCurrentCUDAStream()); + + return output; +} + std::vector ds_swizzle_quant(at::Tensor& input_vals, int groups, int num_bits, @@ -270,6 +290,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("dequantize_int4_to_half_experimental", &dequantize_int4_to_half_experimental, "Dequantize int4 to half (experimental)"); + m.def("dequantize_int8_to_half_experimental", + &dequantize_int8_to_half_experimental, + "Dequantize int8 to half (experimental)"); m.def("swizzle_quant", &ds_swizzle_quant); m.def("quantized_reduction", &quantized_reduction); } diff --git a/csrc/quantization/quantize_int4.cu b/csrc/quantization/quantize_intX.cu similarity index 76% rename from csrc/quantization/quantize_int4.cu rename to csrc/quantization/quantize_intX.cu index fed707c1fa7c..b26151ab5c8c 100644 --- a/csrc/quantization/quantize_int4.cu +++ b/csrc/quantization/quantize_intX.cu @@ -228,3 +228,54 @@ void launch_dequantize_int4_to_half_experimental(uint8_t* data_in, dequantize_int4_to_half<<>>( data_in, data_out, scale_buffer, min_val_buffer, num_group, group_size); } + +template +__device__ __forceinline__ AlignedArray int8_to_half(const AlignedArray& data) +{ + AlignedArray ret; + +#pragma unroll + for (int idx = 0; idx < N; idx += 1) { ret[idx] = half(int(data[idx])); } + + return ret; +} + +__global__ void dequantize_int8_to_half(uint8_t* data_in, + half* data_out, + half* scale_buffer, + half* min_val_buffer, + int num_group, + int group_size) +{ + using AccessType = 
AlignedArray; + using AccessTypeOut = AlignedArray; + + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < num_group * group_size / 8; + idx += blockDim.x * gridDim.x) { + int id_group = idx / (group_size / 8); + AccessType value = reinterpret_cast(data_in)[idx]; + half scale = scale_buffer[id_group]; + half min_value = min_val_buffer[id_group]; + + AccessTypeOut output = int8_to_half(value); + output = divide()(output, scale); + output = plus()(output, min_value); + + reinterpret_cast(data_out)[idx] = output; + } +} + +void launch_dequantize_int8_to_half_experimental(uint8_t* data_in, + half* data_out, + half* scale_buffer, + half* min_val_buffer, + int num_group, + int group_size, + cudaStream_t stream) +{ + int num_warp = num_group / 4; + int num_block = num_warp / 8; // 256 trd / block + + dequantize_int8_to_half<<>>( + data_in, data_out, scale_buffer, min_val_buffer, num_group, group_size); +} diff --git a/deepspeed/inference/quantization/utils.py b/deepspeed/inference/quantization/utils.py index d47eb265c214..712abc384a44 100644 --- a/deepspeed/inference/quantization/utils.py +++ b/deepspeed/inference/quantization/utils.py @@ -105,20 +105,24 @@ def __init__(self, config: Dict, dtype: torch.dtype) -> None: def dequantize(self, tensor: Tensor, quant_scale: Tensor, quant_min: Tensor) -> Tensor: # Use customized CUDA quantization kernel if possible. if self.config['group_size'] % 8 == 0 and \ - self.config['num_bits'] == 4 and \ + (self.config['num_bits'] == 4 or self.config['num_bits'] == 8) and \ self.config['group_dim'] == len(tensor.shape) - 1 and \ self.dtype == torch.float16 and device == 'cuda': last_dimension_size = self.config['group_size'] if self.config['num_bits'] == 4: last_dimension_size = last_dimension_size // 2 - quantized_tensor = get_quantizer_cuda_module().dequantize_int4_to_half_experimental( - tensor.reshape(-1, last_dimension_size), quant_scale, quant_min, - tensor.numel() // last_dimension_size, self.config['group_size']) - - shape = list(tensor.shape) - if self.config['num_bits'] == 4: + quantized_tensor = get_quantizer_cuda_module().dequantize_int4_to_half_experimental( + tensor.reshape(-1, last_dimension_size), quant_scale, quant_min, + tensor.numel() // last_dimension_size, self.config['group_size']) + shape = list(tensor.shape) shape[-1] = shape[-1] * 2 + elif self.config['num_bits'] == 8: + # last_dimension_size = last_dimension_size // 2 + quantized_tensor = get_quantizer_cuda_module().dequantize_int8_to_half_experimental( + tensor.reshape(-1, last_dimension_size), quant_scale, quant_min, + tensor.numel() // last_dimension_size, self.config['group_size']) + shape = list(tensor.shape) return quantized_tensor.reshape(shape) diff --git a/op_builder/quantizer.py b/op_builder/quantizer.py index ada80b8f3331..fd765b743de0 100644 --- a/op_builder/quantizer.py +++ b/op_builder/quantizer.py @@ -22,7 +22,7 @@ def sources(self): 'csrc/quantization/pt_binding.cpp', 'csrc/quantization/fake_quantizer.cu', 'csrc/quantization/quantize.cu', - 'csrc/quantization/quantize_int4.cu', + 'csrc/quantization/quantize_intX.cu', 'csrc/quantization/dequantize.cu', 'csrc/quantization/swizzled_quantize.cu', 'csrc/quantization/quant_reduce.cu', diff --git a/tests/unit/inference/quantization/test_int4_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py similarity index 91% rename from tests/unit/inference/quantization/test_int4_quantization.py rename to tests/unit/inference/quantization/test_intX_quantization.py index 56a5a7d48382..56df2b232d15 100644 
--- a/tests/unit/inference/quantization/test_int4_quantization.py +++ b/tests/unit/inference/quantization/test_intX_quantization.py @@ -53,12 +53,11 @@ def quantization_test_helper(pre_quant_type: torch.dtype, num_bits: int): assert mean_diff < 0.15 and max_diff < 0.5, f'Numeric error exceed threshold, mean diff {mean_diff} (threshold 0.15), max diff {max_diff} (threshold 0.5)' -def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool): +def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): import deepspeed from transformers.deepspeed import HfDeepSpeedConfig - def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool) -> Dict: - bits = 4 + def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict: GB = 1 << 30 ds_config = { @@ -143,7 +142,7 @@ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: b return ds_config hf_config = AutoConfig.from_pretrained('facebook/opt-125m') - ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload) + ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits) input_ids = torch.ones(1, 16, dtype=torch.int32, device=device) attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device) @@ -171,12 +170,11 @@ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: b assert mean_diff < 0.4, f'Numeric error exceed threshold, relative error {mean_diff} (threshold 0.4)' -def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool): +def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): import deepspeed from transformers.deepspeed import HfDeepSpeedConfig - def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool) -> Dict: - bits = 4 + def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict: GB = 1 << 30 ds_config = { @@ -223,7 +221,7 @@ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: b return ds_config hf_config = AutoConfig.from_pretrained('facebook/opt-125m') - ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload) + ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits) input_ids = torch.ones(1, 16, dtype=torch.int32, device=device) attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device) @@ -249,16 +247,26 @@ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: b assert mean_diff < 0.4, f'Numeric error exceed threshold, relative error {mean_diff} (threshold 0.4)' -class TestQuantizedInt4(DistributedTest): +@pytest.fixture(params=[4, 8], ids=["4bits", "8bits"]) +def quantization_bits(request): + return request.param - def test_model_quantization(self): + +@pytest.fixture(params=[0, 1], ids=["0", "1"]) +def group_dim(request): + return request.param + + +class TestQuantizedInt(DistributedTest): + + def test_model_quantization(self, quantization_bits): reset_random() config = AutoConfig.from_pretrained('facebook/opt-125m') with torch.no_grad(): model = OPTDecoderLayer(config).half().to(device) - bits = 4 + bits = quantization_bits ds_config = { 'weight_quantization': { @@ -307,7 +315,7 @@ def test_model_quantization(self): assert 
type(model.self_attn.out_proj) is QuantizedLinear @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') - def test_quantized_linear(self): + def test_quantized_linear(self, quantization_bits, group_dim): reset_random() layers = [] @@ -326,9 +334,9 @@ def test_quantized_linear(self): 'weight_quantization': { 'post_init_quant': { 'layer': { - 'num_bits': 4, + 'num_bits': quantization_bits, 'group_size': 64, - 'group_dim': 0, + 'group_dim': group_dim, 'symmetric': False } } @@ -368,31 +376,31 @@ def test_half_int8_quantization(self): quantization_test_helper(torch.float16, 8) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') - def test_zero3_int4_post_init_quant(self): + def test_zero3_int4_post_init_quant(self, quantization_bits): reset_random() - zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False) + zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') - def test_zero3_int4_post_init_quant_cpu_offload(self): + def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits): reset_random() - zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False) + zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') def test_zero3_int4_post_init_quant_nvme_offload(self): reset_random() - zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True) + zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') - def test_zero3_int4_quantized_initialization(self): + def test_zero3_int4_quantized_initialization(self, quantization_bits): reset_random() - zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False) + zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') - def test_zero3_int4_quantized_initialization_cpu_offload(self): + def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits): reset_random() - zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False) + zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits) @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') def test_zero3_int4_quantized_initialization_nvme_offload(self): reset_random() - zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True) + zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4)
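For reference, the asymmetric fine-grained block dequantization that the new int8 kernel in PATCH 11/11 performs reduces to q / scale + min_val applied per group, matching the divide-then-add sequence in quantize_intX.cu. The short PyTorch sketch below mirrors that math outside the CUDA path; the function and variable names are chosen for illustration and are not part of the DeepSpeed API.

import torch

def dequantize_int8_reference(q: torch.Tensor, scale: torch.Tensor, min_val: torch.Tensor,
                              num_group: int, group_size: int) -> torch.Tensor:
    # q holds num_group * group_size uint8 values; scale and min_val hold one fp16 entry per group.
    q = q.reshape(num_group, group_size).to(torch.float16)
    return q / scale.reshape(num_group, 1) + min_val.reshape(num_group, 1)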
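Similarly, the LmHeadLinearAllreduce layer added in PATCH 03/11 shards the hidden dimension across ranks: each rank multiplies its input slice by the matching weight shard and the partial products are summed with inference_all_reduce. The hypothetical single-process sketch below (illustrative shapes and names, not DeepSpeed code) checks that this recombination equals the unsharded matmul.

import torch

hidden, vocab, world_size = 8, 16, 2
x = torch.randn(1, 4, hidden)      # [batch, seq, hidden]
w = torch.randn(vocab, hidden)     # full (unsharded) lm_head weight
shard = hidden // world_size
partials = [
    torch.matmul(x[:, :, r * shard:(r + 1) * shard],
                 w[:, r * shard:(r + 1) * shard].transpose(-1, -2))
    for r in range(world_size)
]
# dist.inference_all_reduce sums the per-rank partials; summing locally stands in for it here.
assert torch.allclose(sum(partials), torch.matmul(x, w.transpose(-1, -2)), atol=1e-5)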
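Finally, the DEEPSPEED_TIMEOUT variable introduced in PATCH 09/11 is interpreted in minutes and read when deepspeed is first imported, so it must be set beforehand. A minimal sketch, assuming a standard DeepSpeed install:

import os
os.environ["DEEPSPEED_TIMEOUT"] = "60"  # minutes; overrides the 30-minute default
import deepspeed  # default_pg_timeout in deepspeed/constants.py now evaluates to timedelta(minutes=60)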